def _get_grouper(y, graph):
    """Group the values of ``y`` by the first index level of the graph adjacency.

    The integer codes of the second level of ``graph._adjacency.index`` are
    used as *positions* into ``y`` (via ``take``) and the selected values are
    grouped by the integer codes of the first level. In a libpysal ``Graph``
    these levels are presumably the focal and the neighbour ids (confirm
    against libpysal), so each group would hold the values found in the
    neighbourhood of one focal observation.

    Because positions rather than labels are used, ``y`` has to be ordered
    like the index along which ``graph`` was built.

    Parameters
    ----------
    y : Series
        Values to be grouped.
    graph : libpysal.graph.Graph
        Graph whose ``_adjacency`` index defines the grouping.

    Returns
    -------
    pandas.core.groupby.SeriesGroupBy
        Group-by object keyed by the first-level codes.
    """
    codes = graph._adjacency.index.codes
    return y.take(codes[1]).groupby(codes[0])


def values_range(
    y: Series | NDArray[np.float64], graph: Graph, q: tuple | list = (0, 100)
) -> Series:
    """Calculates the range of values within neighbours defined in ``graph``.

    Adapted from :cite:`dibble2017`.

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Parameters
    ----------
    y : Series | numpy.ndarray
        The values to be analysed.
    graph : libpysal.graph.Graph
        Graph defining the neighbours of each observation.
    q : tuple, list, optional (default (0, 100))
        A two-element sequence containing floats between 0 and 100 (inclusive)
        that are the percentiles over which to compute the range.
        The order of the elements is not important.

    Returns
    -------
    Series
        A Series containing resulting values.

    Examples
    --------
    >>> tessellation_df['area_IQR_3steps'] = mm.values_range(
    ...     tessellation_df['area'], graph, q=(25, 75)
    ... )
    """
    stats = percentile(y, graph, q=q)
    # min/max make the result independent of the order in which the two
    # percentiles were passed.
    return stats[max(q)] - stats[min(q)]


def theil(y: Series, graph: Graph, q: tuple | list | None = None) -> Series:
    """Calculates the Theil measure of inequality of values within neighbours
    defined in ``graph``.

    Uses ``inequality.theil.Theil`` under the hood. Requires '`inequality`' package.

    .. math::

        T = \\sum_{i=1}^n \\left(
            \\frac{y_i}{\\sum_{i=1}^n y_i} \\ln \\left[
                N \\frac{y_i} {\\sum_{i=1}^n y_i}
            \\right]
        \\right)

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Parameters
    ----------
    y : Series
        A Series containing the values to be analysed.
    graph : libpysal.graph.Graph
        Graph defining the neighbours of each observation.
    q : tuple, list, optional (default None)
        A two-element sequence containing floats between 0 and 100 (inclusive)
        that are the percentiles used to filter the values of each
        neighbourhood before the measure is computed. If ``None`` (or empty),
        all values of each neighbourhood are used.

    Returns
    -------
    Series
        A Series containing resulting values, indexed by ``graph.unique_ids``.

    Raises
    ------
    ImportError
        If the ``inequality`` package is not installed.

    Examples
    --------
    >>> tessellation_df['area_Theil'] = mm.theil(tessellation_df['area'],
    ...                                          graph)
    """
    try:
        from inequality.theil import Theil
    except ImportError as err:
        raise ImportError("The 'inequality' package is required.") from err

    if q:
        grouper = _percentile_filtration_grouper(y, graph._adjacency.index, q=q)
    else:
        grouper = _get_grouper(y, graph)

    result = grouper.apply(lambda x: Theil(x.values).T)
    # The groupers are keyed by positional codes; restore the graph ids.
    result.index = graph.unique_ids
    return result


def simpson(
    y: Series,
    graph: Graph,
    binning: str = "HeadTailBreaks",
    gini_simpson: bool = False,
    inverse: bool = False,
    categorical: bool = False,
    **classification_kwds,
) -> Series:
    """Calculates the Simpson's diversity index of values within neighbours
    defined in ``graph``.

    Uses ``mapclassify.classifiers`` under the hood for binning.
    Requires ``mapclassify>=2.4.2`` dependency.

    .. math::

        \\lambda=\\sum_{i=1}^{R} p_{i}^{2}

    Adapted from :cite:`feliciotti2018`.

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Unless ``categorical=True``, the class bins are derived once from the whole
    of ``y`` and the same bins are then used for every neighbourhood.

    Parameters
    ----------
    y : Series
        A Series containing the values to be analysed.
    graph : libpysal.graph.Graph
        Graph defining the neighbours of each observation.
    binning : str (default 'HeadTailBreaks')
        One of mapclassify classification schemes. For details see the
        mapclassify API documentation.
    gini_simpson : bool (default False)
        Return Gini-Simpson index instead of Simpson index (``1 - λ``).
        Takes precedence over ``inverse`` when both are ``True``.
    inverse : bool (default False)
        Return Inverse Simpson index instead of Simpson index (``1 / λ``).
    categorical : bool (default False)
        Treat values as categories (will not use ``binning``).
    **classification_kwds : dict
        Keyword arguments for the classification scheme.
        For details see the mapclassify documentation.

    Returns
    -------
    Series
        A Series containing resulting values.

    Raises
    ------
    ImportError
        If ``categorical`` is ``False`` and ``mapclassify`` is not installed.

    Examples
    --------
    >>> tessellation_df['area_Simpson'] = mm.simpson(tessellation_df['area'],
    ...                                              graph)

    See also
    --------
    momepy.simpson_diversity : Calculates the Simpson's diversity index of data.
    """
    if not categorical:
        try:
            from mapclassify import classify
        except ImportError as err:
            raise ImportError(
                "The 'mapclassify >= 2.4.2' package is required."
            ) from err
        bins = classify(y, scheme=binning, **classification_kwds).bins
    else:
        bins = None

    def _apply_simpson_diversity(values):
        return simpson_diversity(
            values,
            bins,
            categorical=categorical,
        )

    result = graph.apply(y, _apply_simpson_diversity)

    if gini_simpson:
        result = 1 - result
    elif inverse:
        result = 1 / result
    return result


def shannon(
    y: Series,
    graph: Graph,
    binning: str = "HeadTailBreaks",
    categorical: bool = False,
    categories: list | None = None,
    **classification_kwds,
) -> Series:
    """Calculates the Shannon index of values within neighbours defined in
    ``graph``.

    Uses ``mapclassify.classifiers`` under the hood for binning.
    Requires ``mapclassify>=2.4.2`` dependency.

    .. math::

        H^{\\prime}=-\\sum_{i=1}^{R} p_{i} \\ln p_{i}

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Unless ``categorical=True``, the class bins are derived once from the whole
    of ``y`` and the same bins are then used for every neighbourhood.

    Parameters
    ----------
    y : Series
        A Series containing the values to be analysed.
    graph : libpysal.graph.Graph
        Graph defining the neighbours of each observation.
    binning : str (default 'HeadTailBreaks')
        One of mapclassify classification schemes. For details see the
        mapclassify API documentation.
    categorical : bool (default False)
        Treat values as categories (will not use binning).
    categories : list-like (default None)
        A list of categories. If ``None`` (or empty), ``y.unique()`` is used.
    **classification_kwds : dict
        Keyword arguments for classification scheme.
        For details see the mapclassify documentation.

    Returns
    -------
    Series
        A Series containing resulting values.

    Raises
    ------
    ImportError
        If ``categorical`` is ``False`` and ``mapclassify`` is not installed.

    Examples
    --------
    >>> tessellation_df['area_Shannon'] = mm.shannon(tessellation_df['area'],
    ...                                              graph)
    """
    # ``not categories`` is ambiguous for NumPy arrays / pandas objects and
    # raises ValueError; test for "missing or empty" explicitly instead so that
    # any list-like is accepted.
    if categories is None or len(categories) == 0:
        categories = y.unique()

    if not categorical:
        try:
            from mapclassify import classify
        except ImportError as err:
            raise ImportError(
                "The 'mapclassify >= 2.4.2' package is required."
            ) from err
        bins = classify(y, scheme=binning, **classification_kwds).bins
    else:
        bins = categories

    def _apply_shannon(values):
        return shannon_diversity(values, bins, categorical, categories)

    return graph.apply(y, _apply_shannon)


def gini(y: Series, graph: Graph, q: tuple | list | None = None) -> Series:
    """Calculates the Gini index of values within neighbours defined in ``graph``.

    Uses ``inequality.gini.Gini`` under the hood. Requires '`inequality`' package.

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Parameters
    ----------
    y : Series
        A Series containing the values to be analysed. Must not contain
        negative numbers.
    graph : libpysal.graph.Graph
        Graph defining the neighbours of each observation.
    q : tuple, list, optional (default None)
        A two-element sequence containing floats between 0 and 100 (inclusive)
        that are the percentiles used to filter the values of each
        neighbourhood before the index is computed. If ``None`` (or empty),
        all values of each neighbourhood are used.

    Returns
    -------
    Series
        A Series containing resulting values, indexed by ``graph.unique_ids``.

    Raises
    ------
    ValueError
        If ``y`` contains negative numbers.
    ImportError
        If the ``inequality`` package is not installed.

    Examples
    --------
    >>> tessellation_df['area_Gini'] = mm.gini(tessellation_df['area'],
    ...                                        graph)
    """
    # Validate the data first so that invalid input is reported regardless of
    # whether the optional dependency is available.
    if y.min() < 0:
        raise ValueError(
            "Values contain negative numbers. Normalise data before "
            "using momepy.gini."
        )

    try:
        from inequality.gini import Gini
    except ImportError as err:
        raise ImportError("The 'inequality' package is required.") from err

    if q:
        grouper = _percentile_filtration_grouper(y, graph._adjacency.index, q=q)
    else:
        grouper = _get_grouper(y, graph)

    result = grouper.apply(lambda x: Gini(x.values).g)
    # The groupers are keyed by positional codes; restore the graph ids.
    result.index = graph.unique_ids
    return result
def test_theil(self):
    """Check summary statistics of ``mm.theil`` against stored reference values."""
    tess = self.df_tessellation
    area = tess["area"]

    # No ``q`` and the full 0-100 range are both checked against the same
    # reference summary.
    unrestricted = mm.theil(area, self.diversity_graph)
    whole_range = mm.theil(area, self.diversity_graph, q=(0, 100))
    reference = {
        "count": 144,
        "mean": 0.3367193709036915,
        "min": 0.0935437083870931,
        "max": 1.0063687846141105,
    }
    for outcome in (unrestricted, whole_range):
        assert_result(outcome, reference, tess, check_names=False)

    # Values restricted to the 10th-90th percentile of each neighbourhood.
    trimmed = mm.theil(area, self.diversity_graph, q=(10, 90))
    trimmed_reference = {
        "count": 144,
        "mean": 0.09689345872019642,
        "min": 0.03089398223055910,
        "max": 0.2726670141461655,
    }
    assert_result(trimmed, trimmed_reference, tess, check_names=False)

    # An all-zero input is expected to give zero everywhere.
    all_zero = pd.Series(np.zeros(len(tess)), tess.index)
    zero_result = mm.theil(all_zero, self.graph)
    zero_reference = {"count": 144, "mean": 0, "min": 0, "max": 0.0}
    assert_result(zero_result, zero_reference, tess, check_names=False)
def test_gini(self):
    """Check ``mm.gini`` input validation and summary statistics."""
    tess = self.df_tessellation

    # Negative input has to be rejected.
    negative = pd.Series(-1, tess.index)
    with pytest.raises(ValueError):
        mm.gini(negative, self.diversity_graph)

    # All values of every neighbourhood.
    unrestricted = mm.gini(tess["area"], self.diversity_graph)
    reference = {
        "count": 144,
        "mean": 0.38686076469743697,
        "min": 0.24235274498955336,
        "max": 0.6400687910616315,
    }
    assert_result(unrestricted, reference, tess, check_names=False)

    # Values restricted to the 10th-90th percentile of each neighbourhood.
    trimmed = mm.gini(tess["area"], self.diversity_graph, q=(10, 90))
    trimmed_reference = {
        "count": 144,
        "mean": 0.2417437064941186,
        "min": 0.14098983070917345,
        "max": 0.3978182288393458,
    }
    assert_result(trimmed, trimmed_reference, tess, check_names=False)
def test_unique(self):
    """Check the ``nunique`` statistic of ``Graph.describe`` on a categorical column."""
    tess = self.df_tessellation
    tess["cat"] = list(range(8)) * 18

    def _nunique():
        # The column is re-read on every call because it is modified between
        # the individual checks.
        described = self.diversity_graph.describe(
            tess["cat"], statistics=["nunique"]
        )
        return described["nunique"]

    complete_reference = {"count": 144, "mean": 8.0, "min": 8, "max": 8}
    assert_result(_nunique(), complete_reference, tess, check_names=False)

    # With one value set to NaN the same summary is expected.
    tess.loc[0, "cat"] = np.nan
    nan_dropped_reference = {"count": 144, "mean": 8.0, "min": 8, "max": 8}
    assert_result(_nunique(), nan_dropped_reference, tess, check_names=False)

    # To count NAs you have to explicitly process them now: the missing value
    # is replaced by a placeholder string so that it forms its own category.
    tess.loc[tess["cat"].isna(), "cat"] = "np.nan"
    nan_counted_reference = {
        "count": 144,
        "mean": 8.13888888888889,
        "min": 8,
        "max": 9,
    }
    assert_result(_nunique(), nan_counted_reference, tess, check_names=False)
def test_values_range(self):
    """Compare ``mm.values_range`` with the legacy ``mm.Range`` class."""
    tess = self.df_tessellation

    def _same(new, old):
        assert_series_equal(new, old, check_dtype=False, check_names=False)

    # Default percentiles (0, 100).
    _same(
        mm.values_range(tess["area"], self.graph_diversity),
        mm.Range(tess, "area", self.sw, "uID").series,
    )

    # 10th-90th percentile range; the legacy class is called with the
    # "hazen" interpolation.
    _same(
        mm.values_range(tess["area"], self.graph_diversity, q=(10, 90)),
        mm.Range(
            tess,
            "area",
            self.sw,
            "uID",
            interpolation="hazen",
            rng=(10, 90),
        ).series,
    )
def test_simpson(self):
    """Compare ``mm.simpson`` with the legacy ``mm.Simpson`` class."""
    tess = self.df_tessellation

    def _same(new, old):
        assert_series_equal(new, old, check_dtype=False, check_names=False)

    # Default binning.
    _same(
        mm.simpson(tess["area"], self.graph_diversity),
        mm.Simpson(tess, "area", self.sw, "uID").series,
    )

    # Quantile binning with three classes.
    _same(
        mm.simpson(tess.area, self.graph_diversity, binning="quantiles", k=3),
        mm.Simpson(
            tess,
            tess.area,
            self.sw,
            "uID",
            binning="quantiles",
            k=3,
        ).series,
    )

    # Gini-Simpson variant.
    _same(
        mm.simpson(tess.area, self.graph_diversity, gini_simpson=True),
        mm.Simpson(tess, "area", self.sw, "uID", gini_simpson=True).series,
    )

    # Inverse Simpson variant.
    _same(
        mm.simpson(tess.area, self.graph_diversity, inverse=True),
        mm.Simpson(tess, "area", self.sw, "uID", inverse=True).series,
    )

    # Categorical input.
    tess["cat"] = list(range(8)) * 18
    _same(
        mm.simpson(tess.cat, self.graph_diversity, categorical=True),
        mm.Simpson(tess, "cat", self.sw, "uID", categorical=True).series,
    )
def test_shannon(self):
    """Compare ``mm.shannon`` with the legacy ``mm.Shannon`` class."""
    tess = self.df_tessellation

    def _same(new, old):
        assert_series_equal(new, old, check_dtype=False, check_names=False)

    # Default binning.
    _same(
        mm.shannon(tess["area"], self.graph_diversity),
        mm.Shannon(tess, "area", self.sw, "uID").series,
    )

    # Quantile binning with three classes.
    _same(
        mm.shannon(tess["area"], self.graph_diversity, binning="quantiles", k=3),
        mm.Shannon(
            tess,
            tess.area,
            self.sw,
            "uID",
            binning="quantiles",
            k=3,
        ).series,
    )

    # Categorical input.
    tess["cat"] = list(range(8)) * 18
    _same(
        mm.shannon(tess.cat, self.graph_diversity, categorical=True),
        mm.Shannon(tess, "cat", self.sw, "uID", categorical=True).series,
    )
self.df_tessellation.loc[self.df_tessellation["cat"].isna(), "cat"] = "np.nan" + un_new = self.graph_diversity.describe( + self.df_tessellation["cat"], statistics=["nunique"] + )["nunique"] + un_old = mm.Unique( + self.df_tessellation, "cat", self.sw, "uID", dropna=False + ).series + assert_series_equal(un_new, un_old, check_dtype=False, check_names=False) + def test_unweighted_percentile(self): sw = mm.sw_high(k=3, gdf=self.df_tessellation, ids="uID") graph = (