Add region sum check (#88)

* Add failing tests of database checking * Pass failing tests of database checking Update tutorials a bit too Ready to give to Matt and Dan for first look * Update based on conversations * Add release note * Remove World region hardcoding * Update tutorial notebook * Update docstrings * Clean up linting errors * Attempt to pass stickler
IAMconsortium · Oct 15, 2018 · 5e34dfc · 5e34dfc
1 parent 6a5b600
commit 5e34dfc
Show file tree

Hide file tree

Showing 8 changed files with 5,353 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -110,3 +110,6 @@ ENV/
 
 # mypy
 .mypy_cache/
+
+# pytest
+.pytest_cache/
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -9,6 +9,7 @@
 - (#93)[https://github.com/IAMconsortium/pyam/pull/93] IamDataFrame can be initialized from pd.DataFrame with index
 - (#92)[https://github.com/IAMconsortium/pyam/pull/92] Adding `$` to the pseudo-regexp syntax in `pattern_match()`, adds override option
 - (#90)[https://github.com/IAMconsortium/pyam/pull/90] Adding a function to `set_panel_label()` as part of the plotting library
+- (#88)[https://github.com/IAMconsortium/pyam/pull/88] Adding `check_aggregate_regions` and `check_internal_consistency` to help with database validation, especially for emissions users
 - (#87)[https://github.com/IAMconsortium/pyam/pull/87] Extending `rename()` to work with model and scenario names
 - (#85)[https://github.com/IAMconsortium/pyam/pull/85] Improved functionality for importing metadata and bugfix for filtering for strings if `nan` values exist in metadata
 - (#83)[https://github.com/IAMconsortium/pyam/pull/83] Extending `filter_by_meta()` to work with non-matching indices between `df` and `data`

diff --git a/pyam/core.py b/pyam/core.py
@@ -31,6 +31,7 @@
     cast_years_to_int,
     META_IDX,
     YEAR_IDX,
+    REGION_IDX,
     IAMC_IDX,
     SORT_IDX,
     LONG_IDX,
@@ -516,7 +517,13 @@ def check_aggregate(self, variable, components=None, units=None,
             components = self.filter(variable='{}|*'.format(variable),
                                      level=0).variables()
 
-        # filter and groupby data, use `pd.Series.align` for machting index
+        if not len(components):
+            msg = '{} - cannot check aggregate because it has no components'
+            logger().info(msg.format(variable))
+
+            return
+
+        # filter and groupby data, use `pd.Series.align` for matching index
         df_variable, df_components = (
             _aggregate_by_variables(self.data, variable, units)
             .align(_aggregate_by_variables(self.data, components, units))
@@ -527,14 +534,133 @@ def check_aggregate(self, variable, components=None, units=None,
                                        **kwargs)]
 
         if len(diff):
-            msg = '{} of {} data points are not aggregates of components'
-            logger().info(msg.format(len(diff), len(df_variable)))
+            msg = '{} - {} of {} data points are not aggregates of components'
+            logger().info(msg.format(variable, len(diff), len(df_variable)))
+
+            if exclude_on_fail:
+                self._exclude_on_fail(diff.index.droplevel([2, 3]))
+
+            diff = pd.concat([diff], keys=[variable], names=['variable'])
+
+            return diff.unstack().rename_axis(None, axis=1)
+
+    def check_aggregate_regions(self, variable, region='World',
+                                components=None, units=None,
+                                exclude_on_fail=False, **kwargs):
+        """Check whether the region timeseries data match the aggregation
+        of components
+
+        Parameters
+        ----------
+        variable: str
+            variable to be checked for matching aggregation of components data
+        region: str
+            region to be checked for matching aggregation of components data
+        components: list of str, default None
+            list of regions, defaults to all regions except region
+        units: str or list of str, default None
+            filter variable and components for given unit(s)
+        exclude_on_fail: boolean, default False
+            flag scenarios failing validation as `exclude: True`
+        kwargs: passed to `np.isclose()`
+        """
+        var_df = self.filter(variable=variable, level=0)
+
+        if components is None:
+            components = var_df.filter(region=region, keep=False).regions()
+
+        if not len(components):
+            msg = (
+                '{} - cannot check regional aggregate because it has no '
+                'regional components'
+            )
+            logger().info(msg.format(variable))
+
+            return None
+
+        # filter and groupby data, use `pd.Series.align` for matching index
+        df_region, df_components = (
+            _aggregate_by_regions(var_df.data, region, units)
+            .align(_aggregate_by_regions(var_df.data, components, units))
+        )
+
+        df_components.index = df_components.index.droplevel(
+            "variable"
+        )
+
+        # Add in variables that are included in region totals but which
+        # aren't included in the regional components.
+        # For example, if we are looking at World and Emissions|BC, we need
+        # to add aviation and shipping to the sum of Emissions|BC for each
+        # of World's regional components to do a valid check.
+        different_region = components[0]
+        variable_components = self.filter(
+            variable="{}|*".format(variable)
+        ).variables()
+        for var_to_add in variable_components:
+            var_rows = self.data.variable == var_to_add
+            region_rows = self.data.region == different_region
+            var_has_regional_info = (var_rows & region_rows).any()
+            if not var_has_regional_info:
+                df_var_to_add = self.filter(
+                    region=region, variable=var_to_add
+                ).data.groupby(REGION_IDX).sum()['value']
+                df_var_to_add.index = df_var_to_add.index.droplevel("variable")
+
+                if len(df_var_to_add):
+                    df_components = df_components.add(df_var_to_add,
+                                                      fill_value=0)
+
+        df_components = pd.concat([df_components], keys=[variable],
+                                  names=['variable'])
+
+        # use `np.isclose` for checking match
+        diff = df_region[~np.isclose(df_region, df_components, **kwargs)]
+
+        if len(diff):
+            msg = (
+                '{} - {} of {} data points are not aggregates of regional '
+                'components'
+            )
+            logger().info(msg.format(variable, len(diff), len(df_region)))
 
             if exclude_on_fail:
                 self._exclude_on_fail(diff.index.droplevel([2, 3]))
 
+            diff = pd.concat([diff], keys=[region], names=['region'])
+
             return diff.unstack().rename_axis(None, axis=1)
 
+    def check_internal_consistency(self, **kwargs):
+        """Check whether the database is internally consistent
+
+        We check that all variables are equal to the sum of their sectoral
+        components and that all the regions add up to the World total. If
+        the check is passed, None is returned, otherwise a dictionary of
+        inconsistent variables is returned.
+
+        Note: at the moment, this method's regional checking is limited to
+        checking that all the regions sum to the World region. We cannot
+        make this more automatic unless we start to store how the regions
+        relate, see
+        [this issue](https://github.com/IAMconsortium/pyam/issues/106).
+
+        Parameters
+        ----------
+        kwargs: passed to `np.isclose()`
+        """
+        inconsistent_vars = {}
+        for variable in self.variables():
+            diff_agg = self.check_aggregate(variable, **kwargs)
+            if diff_agg is not None:
+                inconsistent_vars[variable + "-aggregate"] = diff_agg
+
+            diff_regional = self.check_aggregate_regions(variable, **kwargs)
+            if diff_regional is not None:
+                inconsistent_vars[variable + "-regional"] = diff_regional
+
+        return inconsistent_vars if inconsistent_vars else None
+
     def _exclude_on_fail(self, df):
         """Assign a selection of scenarios as `exclude: True` in meta"""
         idx = df if isinstance(df, pd.MultiIndex) else _meta_idx(df)
@@ -880,6 +1006,17 @@ def _aggregate_by_variables(df, variables, units=None):
     return df.groupby(YEAR_IDX).sum()['value']
 
 
+def _aggregate_by_regions(df, regions, units=None):
+    regions = [regions] if isstr(regions) else regions
+    df = df[df.region.isin(regions)]
+
+    if units is not None:
+        units = [units] if isstr(units) else units
+        df = df[df.unit.isin(units)]
+
+    return df.groupby(REGION_IDX).sum()['value']
+
+
 def _apply_filters(data, meta, filters):
     """Applies filters to the data and meta tables of an IamDataFrame.
 

diff --git a/pyam/utils.py b/pyam/utils.py
@@ -29,6 +29,7 @@
 # common indices
 META_IDX = ['model', 'scenario']
 YEAR_IDX = ['model', 'scenario', 'region', 'year']
+# grouping columns for summing across regions (all of LONG_IDX except
+# region and unit)
+REGION_IDX = ['model', 'scenario', 'variable', 'year']
 IAMC_IDX = ['model', 'scenario', 'region', 'variable', 'unit']
 SORT_IDX = ['model', 'scenario', 'variable', 'year', 'region']
 LONG_IDX = IAMC_IDX + ['year']

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -31,6 +31,119 @@
 )
 
 
+# shared [model, scenario] row prefixes for the 'MSG-GLB' rows below
+mg_ascen = ['MSG-GLB', 'a_scen']
+mg_ascen_2 = ['MSG-GLB', 'a_scen_2']
+# Timeseries test data wrapped by the `check_aggregate_df` fixture: one row
+# per model / scenario / region / variable / unit with values for the
+# years 2005 and 2010
+CHECK_AGG_DF = pd.DataFrame([
+    ['IMG', 'a_scen', 'R5ASIA', 'Primary Energy', 'EJ/y', 1, 6],
+    ['IMG', 'a_scen', 'R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.75, 5],
+    ['IMG', 'a_scen', 'R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.25, 1],
+    ['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 3, 8],
+    ['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1, 3],
+    ['IMG', 'a_scen', 'R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2, 5],
+    ['IMG', 'a_scen', 'R5REF', 'Primary Energy', 'EJ/y', 0.3, 0.6],
+    ['IMG', 'a_scen', 'R5REF', 'Primary Energy|Coal', 'EJ/y', 0.15, 0.4],
+    ['IMG', 'a_scen', 'R5REF', 'Primary Energy|Gas', 'EJ/y', 0.15, 0.2],
+    ['IMG', 'a_scen', 'R5REF', 'Emissions|CO2', 'Mt CO2/yr', 1, 1.4],
+    ['IMG', 'a_scen', 'R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.6, 0.8],
+    ['IMG', 'a_scen', 'R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.4, 0.6],
+    ['IMG', 'a_scen', 'World', 'Primary Energy', 'EJ/y', 1.3, 6.6],
+    ['IMG', 'a_scen', 'World', 'Primary Energy|Coal', 'EJ/y', 0.9, 5.4],
+    ['IMG', 'a_scen', 'World', 'Primary Energy|Gas', 'EJ/y', 0.4, 1.2],
+    ['IMG', 'a_scen', 'World', 'Emissions|CO2', 'Mt CO2/yr', 4, 9.4],
+    ['IMG', 'a_scen', 'World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.6, 3.8],
+    ['IMG', 'a_scen', 'World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.4, 5.6],
+    ['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy', 'EJ/y', 1.4, 6.4],
+    ['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.95, 5.2],
+    ['IMG', 'a_scen_2', 'R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.45, 1.2],
+    ['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 3.4, 8.4],
+    ['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.2, 3.2],
+    ['IMG', 'a_scen_2', 'R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.2, 5.2],
+    ['IMG', 'a_scen_2', 'R5REF', 'Primary Energy', 'EJ/y', 0.7, 1.0],
+    ['IMG', 'a_scen_2', 'R5REF', 'Primary Energy|Coal', 'EJ/y', 0.35, 0.6],
+    ['IMG', 'a_scen_2', 'R5REF', 'Primary Energy|Gas', 'EJ/y', 0.35, 0.4],
+    ['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2', 'Mt CO2/yr', 1.4, 1.8],
+    ['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.8, 1.0],
+    ['IMG', 'a_scen_2', 'R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.6, 0.8],
+    ['IMG', 'a_scen_2', 'World', 'Primary Energy', 'EJ/y', 2.1, 7.4],
+    ['IMG', 'a_scen_2', 'World', 'Primary Energy|Coal', 'EJ/y', 1.3, 5.8],
+    ['IMG', 'a_scen_2', 'World', 'Primary Energy|Gas', 'EJ/y', 0.8, 1.6],
+    ['IMG', 'a_scen_2', 'World', 'Emissions|CO2', 'Mt CO2/yr', 4.8, 10.2],
+    ['IMG', 'a_scen_2', 'World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 2.0, 4.2],
+    ['IMG', 'a_scen_2', 'World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.8, 6.0],
+    mg_ascen + ['R5ASIA', 'Primary Energy', 'EJ/y', 0.8, 5.8],
+    mg_ascen + ['R5ASIA', 'Primary Energy|Coal', 'EJ/y', 0.65, 4.9],
+    mg_ascen + ['R5ASIA', 'Primary Energy|Gas', 'EJ/y', 0.15, 0.9],
+    mg_ascen + ['R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', 2.8, 7.8],
+    mg_ascen + ['R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.9, 2.9],
+    mg_ascen + ['R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', 1.9, 4.9],
+    mg_ascen + ['R5REF', 'Primary Energy', 'EJ/y', 0.1, 0.4],
+    mg_ascen + ['R5REF', 'Primary Energy|Coal', 'EJ/y', 0.05, 0.3],
+    mg_ascen + ['R5REF', 'Primary Energy|Gas', 'EJ/y', 0.05, 0.1],
+    mg_ascen + ['R5REF', 'Emissions|CO2', 'Mt CO2/yr', 0.8, 1.2],
+    mg_ascen + ['R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', 0.5, 0.7],
+    mg_ascen + ['R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', 0.3, 0.5],
+    mg_ascen + ['World', 'Primary Energy', 'EJ/y', 0.9, 6.2],
+    mg_ascen + ['World', 'Primary Energy|Coal', 'EJ/y', 0.7, 5.2],
+    mg_ascen + ['World', 'Primary Energy|Gas', 'EJ/y', 0.2, 1.0],
+    mg_ascen + ['World', 'Emissions|CO2', 'Mt CO2/yr', 3.6, 9.0],
+    mg_ascen + ['World', 'Emissions|CO2|Cars', 'Mt CO2/yr', 1.4, 3.6],
+    mg_ascen + ['World', 'Emissions|CO2|Tar', 'Mt CO2/yr', 2.2, 5.4],
+    mg_ascen_2 + ['R5ASIA', 'Primary Energy', 'EJ/y', -1.4, -6.4],
+    mg_ascen_2 + ['R5ASIA', 'Primary Energy|Coal', 'EJ/y', -0.95, -5.2],
+    mg_ascen_2 + ['R5ASIA', 'Primary Energy|Gas', 'EJ/y', -0.45, -1.2],
+    mg_ascen_2 + ['R5ASIA', 'Emissions|CO2', 'Mt CO2/yr', -3.4, -8.4],
+    mg_ascen_2 + ['R5ASIA', 'Emissions|CO2|Cars', 'Mt CO2/yr', -1.2, -3.2],
+    mg_ascen_2 + ['R5ASIA', 'Emissions|CO2|Tar', 'Mt CO2/yr', -2.2, -5.2],
+    mg_ascen_2 + ['R5REF', 'Primary Energy', 'EJ/y', -0.7, -1.0],
+    mg_ascen_2 + ['R5REF', 'Primary Energy|Coal', 'EJ/y', -0.35, -0.6],
+    mg_ascen_2 + ['R5REF', 'Primary Energy|Gas', 'EJ/y', -0.35, -0.4],
+    mg_ascen_2 + ['R5REF', 'Emissions|CO2', 'Mt CO2/yr', -1.4, -1.8],
+    mg_ascen_2 + ['R5REF', 'Emissions|CO2|Cars', 'Mt CO2/yr', -0.8, -1.0],
+    mg_ascen_2 + ['R5REF', 'Emissions|CO2|Tar', 'Mt CO2/yr', -0.6, -0.8],
+    mg_ascen_2 + ['World', 'Primary Energy', 'EJ/y', -2.1, -7.4],
+    mg_ascen_2 + ['World', 'Primary Energy|Coal', 'EJ/y', -1.3, -5.8],
+    mg_ascen_2 + ['World', 'Primary Energy|Gas', 'EJ/y', -0.8, -1.6],
+    mg_ascen_2 + ['World', 'Emissions|CO2', 'Mt CO2/yr', -5.0, -10.6],
+    mg_ascen_2 + ['World', 'Emissions|CO2|Cars', 'Mt CO2/yr', -2.0, -4.2],
+    mg_ascen_2 + ['World', 'Emissions|CO2|Tar', 'Mt CO2/yr', -2.8, -6.0],
+    mg_ascen_2 + ['World', 'Emissions|CO2|Agg Agg', 'Mt CO2/yr', -0.2, -0.4],
+    mg_ascen_2 + ['World', 'Emissions|CF4', 'kt CF4/yr', 54, 56],
+    mg_ascen_2 + ['World', 'Emissions|C2F6', 'kt C2F6/yr', 32, 27],
+    mg_ascen_2 + ['World', 'Emissions|C2F6|Solvents', 'kt C2F6/yr', 30, 33],
+    mg_ascen_2 + ['World', 'Emissions|C2F6|Industry', 'kt C2F6/yr', 2, -6],
+    mg_ascen_2 + ['World', 'Emissions|CH4', 'Mt CH4/yr', 322, 217],
+    mg_ascen_2 + ['R5REF', 'Emissions|CH4', 'Mt CH4/yr', 30, 201],
+    mg_ascen_2 + ['R5ASIA', 'Emissions|CH4', 'Mt CH4/yr', 292, 16],
+],
+    columns=['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010],
+)
+
+
+# Timeseries test data wrapped by the `check_aggregate_regional_df` fixture;
+# `Emissions|N2O|Shipping` is only given for World, the other variables are
+# also given for individual regions
+CHECK_AGG_REGIONAL_DF = pd.DataFrame([
+    ['AIM', 'cscen', 'World', 'Emissions|N2O', 'Mt N/yr', 1.8, 15.6],
+    ['AIM', 'cscen', 'World', 'Emissions|N2O|Shipping', 'Mt N/yr', 1, 6],
+    ['AIM', 'cscen', 'World', 'Emissions|N2O|Solvents', 'Mt N/yr', 1.6, 3.8],
+    ['AIM', 'cscen', 'World', 'Emissions|N2O|Transport', 'Mt N/yr', -0.8, 5.8],
+    ['AIM', 'cscen', 'RASIA', 'Emissions|N2O', 'Mt N/yr', 0, 5.9],
+    ['AIM', 'cscen', 'RASIA', 'Emissions|N2O|Solvents', 'Mt N/yr', 0.8, 2.6],
+    ['AIM', 'cscen', 'RASIA', 'Emissions|N2O|Transport', 'Mt N/yr', -0.8, 3.3],
+    ['AIM', 'cscen', 'REUROPE', 'Emissions|N2O', 'Mt N/yr', 0.8, 3.7],
+    ['AIM', 'cscen', 'REUROPE', 'Emissions|N2O|Solvents', 'Mt N/yr', 0.8, 1.2],
+    ['AIM', 'cscen', 'REUROPE', 'Emissions|N2O|Transport', 'Mt N/yr', 0, 2.5],
+    ['AIM', 'cscen', 'China', 'Emissions|N2O', 'Mt N/yr', 0.2, 1.3],
+    ['AIM', 'cscen', 'China', 'Emissions|N2O|Transport', 'Mt N/yr', 0.2, 1.3],
+    ['AIM', 'cscen', 'Japan', 'Emissions|N2O', 'Mt N/yr', -1, 2],
+    ['AIM', 'cscen', 'Japan', 'Emissions|N2O|Transport', 'Mt N/yr', -1, 2],
+    ['AIM', 'cscen', 'Germany', 'Emissions|N2O', 'Mt N/yr', 2, 3],
+    ['AIM', 'cscen', 'Germany', 'Emissions|N2O|Transport', 'Mt N/yr', 2, 3],
+    ['AIM', 'cscen', 'UK', 'Emissions|N2O', 'Mt N/yr', -2, -0.5],
+    ['AIM', 'cscen', 'UK', 'Emissions|N2O|Transport', 'Mt N/yr', -2, -0.5],
+
+],
+    columns=['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010],
+)
+
+
 @pytest.fixture(scope="function")
 def test_df():
     df = IamDataFrame(data=TEST_DF.iloc[:2])
@@ -48,6 +161,18 @@ def meta_df():
     yield df
 
 
+@pytest.fixture(scope="function")
+def check_aggregate_df():
+    df = IamDataFrame(data=CHECK_AGG_DF)
+    yield df
+
+
+@pytest.fixture(scope="function")
+def check_aggregate_regional_df():
+    df = IamDataFrame(data=CHECK_AGG_REGIONAL_DF)
+    yield df
+
+
 @pytest.fixture(scope="function")
 def reg_df():
     df = IamDataFrame(data=REG_DF)