Skip to content

Commit

Permalink
Add region sum check (#88)
Browse files Browse the repository at this point in the history
* Add failing tests of database checking

* Pass failing tests of database checking

Update tutorials a bit too

Ready to give to Matt and Dan for first look

* Update based on conversations

* Add release note

* Remove World region hardcoding

* Update tutorial notebook

* Update docstrings

* Clean up linting errors

* Attempt to pass stickler
  • Loading branch information
znicholls authored and gidden committed Oct 15, 2018
1 parent 6a5b600 commit 5e34dfc
Show file tree
Hide file tree
Showing 8 changed files with 5,353 additions and 17 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,6 @@ ENV/

# mypy
.mypy_cache/

# pytest
.pytest_cache/
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- (#93)[https://github.com/IAMconsortium/pyam/pull/93] IamDataFrame can be initialized from pd.DataFrame with index
- (#92)[https://github.com/IAMconsortium/pyam/pull/92] Adding `$` to the pseudo-regexp syntax in `pattern_match()`, adds override option
- (#90)[https://github.com/IAMconsortium/pyam/pull/90] Adding a function to `set_panel_label()` as part of the plotting library
- (#88)[https://github.com/IAMconsortium/pyam/pull/88] Adding `check_aggregate_regions` and `check_internal_consistency` to help with database validation, especially for emissions users
- (#87)[https://github.com/IAMconsortium/pyam/pull/87] Extending `rename()` to work with model and scenario names
- (#85)[https://github.com/IAMconsortium/pyam/pull/85] Improved functionality for importing metadata and bugfix for filtering for strings if `nan` values exist in metadata
- (#83)[https://github.com/IAMconsortium/pyam/pull/83] Extending `filter_by_meta()` to work with non-matching indices between `df` and `data`
Expand Down
143 changes: 140 additions & 3 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
cast_years_to_int,
META_IDX,
YEAR_IDX,
REGION_IDX,
IAMC_IDX,
SORT_IDX,
LONG_IDX,
Expand Down Expand Up @@ -516,7 +517,13 @@ def check_aggregate(self, variable, components=None, units=None,
components = self.filter(variable='{}|*'.format(variable),
level=0).variables()

# filter and groupby data, use `pd.Series.align` for machting index
if not len(components):
msg = '{} - cannot check aggregate because it has no components'
logger().info(msg.format(variable))

return

# filter and groupby data, use `pd.Series.align` for matching index
df_variable, df_components = (
_aggregate_by_variables(self.data, variable, units)
.align(_aggregate_by_variables(self.data, components, units))
Expand All @@ -527,14 +534,133 @@ def check_aggregate(self, variable, components=None, units=None,
**kwargs)]

if len(diff):
msg = '{} of {} data points are not aggregates of components'
logger().info(msg.format(len(diff), len(df_variable)))
msg = '{} - {} of {} data points are not aggregates of components'
logger().info(msg.format(variable, len(diff), len(df_variable)))

if exclude_on_fail:
self._exclude_on_fail(diff.index.droplevel([2, 3]))

diff = pd.concat([diff], keys=[variable], names=['variable'])

return diff.unstack().rename_axis(None, axis=1)

def check_aggregate_regions(self, variable, region='World',
                            components=None, units=None,
                            exclude_on_fail=False, **kwargs):
    """Check whether the region timeseries data match the aggregation
    of components

    Parameters
    ----------
    variable: str
        variable to be checked for matching aggregation of components data
    region: str
        region to be checked for matching aggregation of components data
    components: str or list of str, default None
        region(s) to sum up, defaults to all regions except `region`
    units: str or list of str, default None
        filter variable and components for given unit(s)
    exclude_on_fail: boolean, default False
        flag scenarios failing validation as `exclude: True`
    kwargs: passed to `np.isclose()`

    Returns
    -------
    None if there are no regional components to compare against or if
    every data point of `region` matches the sum of the components;
    otherwise the non-matching `region` values as a `pd.DataFrame` with
    the last index level (the year) unstacked into columns.
    """
    var_df = self.filter(variable=variable, level=0)

    if components is None:
        components = var_df.filter(region=region, keep=False).regions()
    elif isstr(components):
        # `_aggregate_by_regions` accepts a bare region name, so accept it
        # here too; otherwise `components[0]` below would pick the first
        # *character* of the name instead of the first region.
        components = [components]
    components = list(components)

    if not components:
        msg = (
            '{} - cannot check regional aggregate because it has no '
            'regional components'
        )
        logger().info(msg.format(variable))

        return None

    # filter and groupby data, use `pd.Series.align` for matching index
    df_region, df_components = (
        _aggregate_by_regions(var_df.data, region, units)
        .align(_aggregate_by_regions(var_df.data, components, units))
    )

    df_components.index = df_components.index.droplevel(
        "variable"
    )

    # Add in variables that are included in region totals but which
    # aren't included in the regional components.
    # For example, if we are looking at World and Emissions|BC, we need
    # to add aviation and shipping to the sum of Emissions|BC for each
    # of World's regional components to do a valid check.
    # The first component region is used as the probe: a sub-variable that
    # has no rows for it is treated as being reported for `region` only.
    # NOTE(review): `units` is not applied to these additions - confirm
    # whether that is intended.
    different_region = components[0]
    region_rows = self.data.region == different_region
    variable_components = self.filter(
        variable="{}|*".format(variable)
    ).variables()
    for var_to_add in variable_components:
        var_rows = self.data.variable == var_to_add
        var_has_regional_info = (var_rows & region_rows).any()
        if var_has_regional_info:
            continue

        df_var_to_add = self.filter(
            region=region, variable=var_to_add
        ).data.groupby(REGION_IDX).sum()['value']
        df_var_to_add.index = df_var_to_add.index.droplevel("variable")

        if len(df_var_to_add):
            df_components = df_components.add(df_var_to_add,
                                              fill_value=0)

    df_components = pd.concat([df_components], keys=[variable],
                              names=['variable'])

    # use `np.isclose` for checking match
    diff = df_region[~np.isclose(df_region, df_components, **kwargs)]

    if not len(diff):
        return None

    msg = (
        '{} - {} of {} data points are not aggregates of regional '
        'components'
    )
    logger().info(msg.format(variable, len(diff), len(df_region)))

    if exclude_on_fail:
        # levels 2 and 3 are 'variable' and 'year'; what is left is the
        # (model, scenario) index identifying the scenarios to exclude
        self._exclude_on_fail(diff.index.droplevel([2, 3]))

    diff = pd.concat([diff], keys=[region], names=['region'])

    return diff.unstack().rename_axis(None, axis=1)

def check_internal_consistency(self, **kwargs):
    """Check whether the database is internally consistent

    Every variable is run through `check_aggregate` (the variable against
    the sum of its sectoral components) and `check_aggregate_regions`
    (the regions against the World total, the default region of that
    method). If all checks pass, None is returned, otherwise a dictionary
    of the inconsistent variables is returned, keyed by
    `<variable>-aggregate` and `<variable>-regional`.

    Note: at the moment, this method's regional checking is limited to
    checking that all the regions sum to the World region. We cannot
    make this more automatic unless we start to store how the regions
    relate, see
    [this issue](https://github.com/IAMconsortium/pyam/issues/106).

    Parameters
    ----------
    kwargs: forwarded to both checks, which pass them to `np.isclose()`
    """
    # (key suffix, check) pairs, applied in this order to each variable
    checks = (
        ("-aggregate", self.check_aggregate),
        ("-regional", self.check_aggregate_regions),
    )

    failures = {}
    for name in self.variables():
        for suffix, run_check in checks:
            mismatch = run_check(name, **kwargs)
            if mismatch is not None:
                failures[name + suffix] = mismatch

    return failures or None

def _exclude_on_fail(self, df):
"""Assign a selection of scenarios as `exclude: True` in meta"""
idx = df if isinstance(df, pd.MultiIndex) else _meta_idx(df)
Expand Down Expand Up @@ -880,6 +1006,17 @@ def _aggregate_by_variables(df, variables, units=None):
return df.groupby(YEAR_IDX).sum()['value']


def _aggregate_by_regions(df, regions, units=None):
    """Sum `value` over the given region(s), grouped by `REGION_IDX`

    `regions` and `units` may each be a single string or a list of
    strings; `units=None` disables the unit filter.
    """
    region_list = [regions] if isstr(regions) else regions
    keep = df.region.isin(region_list)

    if units is not None:
        unit_list = [units] if isstr(units) else units
        keep = keep & df.unit.isin(unit_list)

    grouped_sum = df[keep].groupby(REGION_IDX).sum()
    return grouped_sum['value']


def _apply_filters(data, meta, filters):
"""Applies filters to the data and meta tables of an IamDataFrame.
Expand Down
1 change: 1 addition & 0 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# common indices
META_IDX = ['model', 'scenario']
# groupby keys used by `_aggregate_by_variables` in core.py
YEAR_IDX = ['model', 'scenario', 'region', 'year']
# groupby keys used by `_aggregate_by_regions` in core.py
REGION_IDX = ['model', 'scenario', 'variable', 'year']
IAMC_IDX = ['model', 'scenario', 'region', 'variable', 'unit']
SORT_IDX = ['model', 'scenario', 'variable', 'year', 'region']
# IAMC_IDX plus the 'year' column
LONG_IDX = IAMC_IDX + ['year']
Expand Down
125 changes: 125 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,119 @@
)


# Wide-format (one column per year: 2005, 2010) data behind the
# `check_aggregate_df` fixture: two models ('IMG', 'MSG-GLB'), scenarios
# 'a_scen' and 'a_scen_2', regions 'R5ASIA', 'R5REF' and 'World'.
# `mg_ascen` / `mg_ascen_2` are the shared [model, scenario] prefixes of
# the 'MSG-GLB' rows. Note that 'Emissions|CO2|Agg Agg' and the CF4/C2F6
# variables are only reported for 'World'.
mg_ascen = ['MSG-GLB', 'a_scen']
mg_ascen_2 = ['MSG-GLB', 'a_scen_2']
CHECK_AGG_DF = pd.DataFrame([
['IMG', 'a_scen', 'R5ASIA', 'Primary Energy', 'EJ/y', 1, 6],
['IMG', 'a_scen', 'R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.75, 5],
['IMG', 'a_scen', 'R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.25, 1],
['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 3, 8],
['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1, 3],
['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2, 5],
['IMG', 'a_scen', 'R5REF', 'Primary Energy', 'EJ/y', 0.3, 0.6],
['IMG', 'a_scen', 'R5REF', 'Primary Energy|Coal', 'EJ/y', 0.15, 0.4],
['IMG', 'a_scen', 'R5REF', 'Primary Energy|Gas', 'EJ/y', 0.15, 0.2],
['IMG', 'a_scen', 'R5REF', 'Emissions|CO2', 'Mt CO2/yr', 1, 1.4],
['IMG', 'a_scen', 'R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.6, 0.8],
['IMG', 'a_scen', 'R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.4, 0.6],
['IMG', 'a_scen', 'World', 'Primary Energy', 'EJ/y', 1.3, 6.6],
['IMG', 'a_scen', 'World', 'Primary Energy|Coal', 'EJ/y', 0.9, 5.4],
['IMG', 'a_scen', 'World', 'Primary Energy|Gas', 'EJ/y', 0.4, 1.2],
['IMG', 'a_scen', 'World', 'Emissions|CO2', 'Mt CO2/yr', 4, 9.4],
['IMG', 'a_scen', 'World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.6, 3.8],
['IMG', 'a_scen', 'World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.4, 5.6],
['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy', 'EJ/y', 1.4, 6.4],
['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.95, 5.2],
['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.45, 1.2],
['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 3.4, 8.4],
['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.2, 3.2],
['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.2, 5.2],
['IMG', 'a_scen_2', 'R5REF', 'Primary Energy', 'EJ/y', 0.7, 1.0],
['IMG', 'a_scen_2', 'R5REF', 'Primary Energy|Coal', 'EJ/y', 0.35, 0.6],
['IMG', 'a_scen_2', 'R5REF', 'Primary Energy|Gas', 'EJ/y', 0.35, 0.4],
['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2', 'Mt CO2/yr', 1.4, 1.8],
['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.8, 1.0],
['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.6, 0.8],
['IMG', 'a_scen_2', 'World', 'Primary Energy', 'EJ/y', 2.1, 7.4],
['IMG', 'a_scen_2', 'World', 'Primary Energy|Coal', 'EJ/y', 1.3, 5.8],
['IMG', 'a_scen_2', 'World', 'Primary Energy|Gas', 'EJ/y', 0.8, 1.6],
['IMG', 'a_scen_2', 'World', 'Emissions|CO2', 'Mt CO2/yr', 4.8, 10.2],
['IMG', 'a_scen_2', 'World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 2.0, 4.2],
['IMG', 'a_scen_2', 'World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.8, 6.0],
mg_ascen + ['R5ASIA', 'Primary Energy', 'EJ/y', 0.8, 5.8],
mg_ascen + ['R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.65, 4.9],
mg_ascen + ['R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.15, 0.9],
mg_ascen + ['R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 2.8, 7.8],
mg_ascen + ['R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.9, 2.9],
mg_ascen + ['R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 1.9, 4.9],
mg_ascen + ['R5REF', 'Primary Energy', 'EJ/y', 0.1, 0.4],
mg_ascen + ['R5REF', 'Primary Energy|Coal', 'EJ/y', 0.05, 0.3],
mg_ascen + ['R5REF', 'Primary Energy|Gas', 'EJ/y', 0.05, 0.1],
mg_ascen + ['R5REF', 'Emissions|CO2', 'Mt CO2/yr', 0.8, 1.2],
mg_ascen + ['R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.5, 0.7],
mg_ascen + ['R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.3, 0.5],
mg_ascen + ['World', 'Primary Energy', 'EJ/y', 0.9, 6.2],
mg_ascen + ['World', 'Primary Energy|Coal', 'EJ/y', 0.7, 5.2],
mg_ascen + ['World', 'Primary Energy|Gas', 'EJ/y', 0.2, 1.0],
mg_ascen + ['World', 'Emissions|CO2', 'Mt CO2/yr', 3.6, 9.0],
mg_ascen + ['World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.4, 3.6],
mg_ascen + ['World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.2, 5.4],
mg_ascen_2 + ['R5ASIA', 'Primary Energy', 'EJ/y', -1.4, -6.4],
mg_ascen_2 + ['R5ASIA', 'Primary Energy|Coal', 'EJ/y', -0.95, -5.2],
mg_ascen_2 + ['R5ASIA', 'Primary Energy|Gas', 'EJ/y', -0.45, -1.2],
mg_ascen_2 + ['R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', -3.4, -8.4],
mg_ascen_2 + ['R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', -1.2, -3.2],
mg_ascen_2 + ['R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', -2.2, -5.2],
mg_ascen_2 + ['R5REF', 'Primary Energy', 'EJ/y', -0.7, -1.0],
mg_ascen_2 + ['R5REF', 'Primary Energy|Coal', 'EJ/y', -0.35, -0.6],
mg_ascen_2 + ['R5REF', 'Primary Energy|Gas', 'EJ/y', -0.35, -0.4],
mg_ascen_2 + ['R5REF', 'Emissions|CO2', 'Mt CO2/yr', -1.4, -1.8],
mg_ascen_2 + ['R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', -0.8, -1.0],
mg_ascen_2 + ['R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', -0.6, -0.8],
mg_ascen_2 + ['World', 'Primary Energy', 'EJ/y', -2.1, -7.4],
mg_ascen_2 + ['World', 'Primary Energy|Coal', 'EJ/y', -1.3, -5.8],
mg_ascen_2 + ['World', 'Primary Energy|Gas', 'EJ/y', -0.8, -1.6],
mg_ascen_2 + ['World', 'Emissions|CO2', 'Mt CO2/yr', -5.0, -10.6],
mg_ascen_2 + ['World', 'Emissions|CO2|Cars', 'Mt CO2/yr', -2.0, -4.2],
mg_ascen_2 + ['World', 'Emissions|CO2|Tar', 'Mt CO2/yr', -2.8, -6.0],
mg_ascen_2 + ['World', 'Emissions|CO2|Agg Agg', 'Mt CO2/yr', -0.2, -0.4],
mg_ascen_2 + ['World', 'Emissions|CF4', 'kt CF4/yr', 54, 56],
mg_ascen_2 + ['World', 'Emissions|C2F6', 'kt C2F6/yr', 32, 27],
mg_ascen_2 + ['World', 'Emissions|C2F6|Solvents', 'kt C2F6/yr', 30, 33],
mg_ascen_2 + ['World', 'Emissions|C2F6|Industry', 'kt C2F6/yr', 2, -6],
mg_ascen_2 + ['World', 'Emissions|CH4', 'Mt CH4/yr', 322, 217],
mg_ascen_2 + ['R5REF', 'Emissions|CH4', 'Mt CH4/yr', 30, 201],
mg_ascen_2 + ['R5ASIA', 'Emissions|CH4', 'Mt CH4/yr', 292, 16],
],
columns=['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010],
)


# Wide-format (one column per year: 2005, 2010) data behind the
# `check_aggregate_regional_df` fixture: a single model/scenario
# ('AIM', 'cscen') reporting 'Emissions|N2O' and sub-variables for 'World'
# and six other regions. 'Emissions|N2O|Shipping' is only reported for
# 'World', and some regions only report the 'Transport' sub-variable.
CHECK_AGG_REGIONAL_DF = pd.DataFrame([
['AIM', 'cscen', 'World', 'Emissions|N2O', 'Mt N/yr', 1.8, 15.6],
['AIM', 'cscen', 'World', 'Emissions|N2O|Shipping', 'Mt N/yr', 1, 6],
['AIM', 'cscen', 'World', 'Emissions|N2O|Solvents', 'Mt N/yr', 1.6, 3.8],
['AIM', 'cscen', 'World', 'Emissions|N2O|Transport', 'Mt N/yr', -0.8, 5.8],
['AIM', 'cscen', 'RASIA', 'Emissions|N2O', 'Mt N/yr', 0, 5.9],
['AIM', 'cscen', 'RASIA', 'Emissions|N2O|Solvents', 'Mt N/yr', 0.8, 2.6],
['AIM', 'cscen', 'RASIA', 'Emissions|N2O|Transport', 'Mt N/yr', -0.8, 3.3],
['AIM', 'cscen', 'REUROPE', 'Emissions|N2O', 'Mt N/yr', 0.8, 3.7],
['AIM', 'cscen', 'REUROPE', 'Emissions|N2O|Solvents', 'Mt N/yr', 0.8, 1.2],
['AIM', 'cscen', 'REUROPE', 'Emissions|N2O|Transport', 'Mt N/yr', 0, 2.5],
['AIM', 'cscen', 'China', 'Emissions|N2O', 'Mt N/yr', 0.2, 1.3],
['AIM', 'cscen', 'China', 'Emissions|N2O|Transport', 'Mt N/yr', 0.2, 1.3],
['AIM', 'cscen', 'Japan', 'Emissions|N2O', 'Mt N/yr', -1, 2],
['AIM', 'cscen', 'Japan', 'Emissions|N2O|Transport', 'Mt N/yr', -1, 2],
['AIM', 'cscen', 'Germany', 'Emissions|N2O', 'Mt N/yr', 2, 3],
['AIM', 'cscen', 'Germany', 'Emissions|N2O|Transport', 'Mt N/yr', 2, 3],
['AIM', 'cscen', 'UK', 'Emissions|N2O', 'Mt N/yr', -2, -0.5],
['AIM', 'cscen', 'UK', 'Emissions|N2O|Transport', 'Mt N/yr', -2, -0.5],

],
columns=['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010],
)


@pytest.fixture(scope="function")
def test_df():
df = IamDataFrame(data=TEST_DF.iloc[:2])
Expand All @@ -48,6 +161,18 @@ def meta_df():
yield df


@pytest.fixture(scope="function")
def check_aggregate_df():
    """Yield an `IamDataFrame` built from `CHECK_AGG_DF`, once per test."""
    yield IamDataFrame(data=CHECK_AGG_DF)


@pytest.fixture(scope="function")
def check_aggregate_regional_df():
    """Yield an `IamDataFrame` built from `CHECK_AGG_REGIONAL_DF`, once per
    test.
    """
    yield IamDataFrame(data=CHECK_AGG_REGIONAL_DF)


@pytest.fixture(scope="function")
def reg_df():
df = IamDataFrame(data=REG_DF)
Expand Down
Loading

0 comments on commit 5e34dfc

Please sign in to comment.