From 66ec9d1c94210d2afc0075349b5caf1082bb8637 Mon Sep 17 00:00:00 2001 From: Krasen Samardzhiev Date: Sat, 4 May 2024 20:23:33 +0200 Subject: [PATCH 1/4] functional diversity implementation --- momepy/functional/_diversity.py | 327 ++++++++++++++++++- momepy/functional/tests/test_diversity.py | 366 +++++++++++++++++++++- 2 files changed, 691 insertions(+), 2 deletions(-) diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py index 5ee80bf9..8c9829b7 100644 --- a/momepy/functional/_diversity.py +++ b/momepy/functional/_diversity.py @@ -1,11 +1,14 @@ import warnings import numpy as np +import scipy as sp from libpysal.graph import Graph from numpy.typing import NDArray from pandas import DataFrame, Series -__all__ = ["describe"] +import momepy as mm + +__all__ = ["describe", "values_range", "theil", "simpson", "shannon", "gini", "unique"] def describe( @@ -117,3 +120,325 @@ def _describe(values, q, include_mode=False): stat_.columns = cols return stat_ + + +def values_range( + data: DataFrame | Series, graph: Graph, rng: tuple | list = (0, 100), **kwargs +): + """ + Calculates the range of values within neighbours defined in ``graph``. + Uses ``scipy.stats.iqr`` under the hood. + + Adapted from :cite:`dibble2017`. + + Parameters + ---------- + data : DataFrame | Series + A DataFrame or Series containing the values to be analysed. + graph : libpysal.graph.Graph + A spatial weights matrix for the data. + rng : tuple, list, optional (default (0,100))) + A two-element sequence containing floats between 0 and 100 (inclusive) + that are the percentiles over which to compute the range. + The order of the elements is not important. + **kwargs : dict + Optional arguments for ``scipy.stats.iqr``. + + Returns + ---------- + Series + A Series containing resulting values. + + Examples + -------- + >>> tessellation_df['area_IQR_3steps'] = mm.range(tessellation_df['area'], + ... graph, + ... rng=(25, 75)) + """ + + def _apply_range(values): + return sp.stats.iqr(values, rng=rng, **kwargs) + + return graph.apply(data, _apply_range) + + +def theil(data: DataFrame | Series, graph: Graph, rng: tuple | list = None): + """ + Calculates the Theil measure of inequality of values within neighbours defined in + ``graph``. Uses ``inequality.theil.Theil`` under the hood. + Requires '`inequality`' package. + + .. math:: + + T = \\sum_{i=1}^n \\left( + \\frac{y_i}{\\sum_{i=1}^n y_i} \\ln \\left[ + N \\frac{y_i} {\\sum_{i=1}^n y_i} + \\right] + \\right) + + Parameters + ---------- + data : DataFrame | Series + A DataFrame or Series containing the values to be analysed. + graph : libpysal.graph.Graph + A spatial weights matrix for the data. + rng : tuple, list, optional (default (0,100))) + A two-element sequence containing floats between 0 and 100 (inclusive) + that are the percentiles over which to compute the range. + The order of the elements is not important. + + Returns + ---------- + Series + A Series containing resulting values. + + Examples + -------- + >>> tessellation_df['area_Theil'] = mm.theil(tessellation_df['area'], + ... graph) + """ + + try: + from inequality.theil import Theil + except ImportError as err: + raise ImportError("The 'inequality' package is required.") from err + if rng: + from momepy import limit_range + + def _apply_theil(values): + if rng: + values = limit_range(values, rng=rng) + return Theil(values).T + + return graph.apply(data, _apply_theil) + + +def simpson( + data: DataFrame | Series, + graph: Graph, + binning: str = "HeadTailBreaks", + gini_simpson: bool = False, + inverse: bool = False, + categorical: bool = False, + **classification_kwds, +): + """ + Calculates the Simpson's diversity index of values within neighbours defined in + ``graph``. Uses ``mapclassify.classifiers`` under the hood for binning. + Requires ``mapclassify>=.2.1.0`` dependency. + + .. math:: + + \\lambda=\\sum_{i=1}^{R} p_{i}^{2} + + Adapted from :cite:`feliciotti2018`. + + Parameters + ---------- + data : DataFrame | Series + A DataFrame or Series containing the values to be analysed. + graph : libpysal.graph.Graph + A spatial weights matrix for the data. + binning : str (default 'HeadTailBreaks') + One of mapclassify classification schemes. For details see + `mapclassify API documentation `_. + gini_simpson : bool (default False) + Return Gini-Simpson index instead of Simpson index (``1 - λ``). + inverse : bool (default False) + Return Inverse Simpson index instead of Simpson index (``1 / λ``). + categorical : bool (default False) + Treat values as categories (will not use ``binning``). + **classification_kwds : dict + Keyword arguments for the classification scheme. + For details see `mapclassify documentation `_. + + Returns + ------- + Series + A Series containing resulting values. + + Examples + -------- + >>> tessellation_df['area_Simpson'] = mm.simpson(tessellation_df['area'], + ... graph) + + See also + -------- + momepy.simpson_diversity : Calculates the Simpson's diversity index of data. + """ + if not categorical: + try: + from mapclassify import classify + except ImportError as err: + raise ImportError( + "The 'mapclassify >= 2.4.2` package is required." + ) from err + bins = classify(data, scheme=binning, **classification_kwds).bins + else: + bins = None + + def _apply_simpson_diversity(values): + return mm.simpson_diversity( + values, + bins, + categorical=categorical, + ) + + result = graph.apply(data, _apply_simpson_diversity) + + if gini_simpson: + result = 1 - result + elif inverse: + result = 1 / result + return result + + +def shannon( + data: DataFrame | Series, + graph: Graph, + binning: str = "HeadTailBreaks", + categorical: bool = False, + categories: list = None, + **classification_kwds, +): + """ + Calculates the Shannon index of values within neighbours defined in + ``graph``. Uses ``mapclassify.classifiers`` under the hood + for binning. Requires ``mapclassify>=.2.1.0`` dependency. + + .. math:: + + H^{\\prime}=-\\sum_{i=1}^{R} p_{i} \\ln p_{i} + + Parameters + ---------- + data : DataFrame | Series + A DataFrame or Series containing the values to be analysed. + graph : libpysal.graph.Graph + A spatial weights matrix for the data. + binning : str (default 'HeadTailBreaks') + One of mapclassify classification schemes. For details see + `mapclassify API documentation `_. + categorical : bool (default False) + Treat values as categories (will not use binning). + categories : list-like (default None) + A list of categories. If ``None``, ``values.unique()`` is used. + **classification_kwds : dict + Keyword arguments for classification scheme + For details see `mapclassify documentation `_. + + Returns + ---------- + Series + A Series containing resulting values. + + Examples + -------- + >>> tessellation_df['area_Shannon'] = mm.shannon(tessellation_df['area'], + ... graph) + """ + + if not categories: + categories = data.unique() + + if not categorical: + try: + from mapclassify import classify + except ImportError as err: + raise ImportError( + "The 'mapclassify >= 2.4.2` package is required." + ) from err + bins = classify(data, scheme=binning, **classification_kwds).bins + else: + bins = categories + + def _apply_shannon(values): + return mm.shannon_diversity(values, bins, categorical, categories) + + return graph.apply(data, _apply_shannon) + + +def gini(data: DataFrame | Series, graph: Graph, rng: tuple | list = None): + """ + Calculates the Gini index of values within neighbours defined in + ``graph``. Uses ``inequality.gini.Gini`` under the hood. + Requires '`inequality`' package. + + .. math:: + + Parameters + ---------- + data : DataFrame | Series + A DataFrame or Series containing the values to be analysed. + graph : libpysal.graph.Graph + A spatial weights matrix for the data. + rng : tuple, list, optional (default (0,100))) + A two-element sequence containing floats between 0 and 100 (inclusive) + that are the percentiles over which to compute the range. + The order of the elements is not important. + + Returns + ---------- + Series + A Series containing resulting values. + + Examples + -------- + >>> tessellation_df['area_Gini'] = mm.gini(tessellation_df['area'], + ... graph) + """ + try: + from inequality.gini import Gini + except ImportError as err: + raise ImportError("The 'inequality' package is required.") from err + + if data.min() < 0: + raise ValueError( + "Values contain negative numbers. Normalise data before" + "using momepy.Gini." + ) + if rng: + from momepy import limit_range + + def _apply_gini(values): + if isinstance(values, Series): + values = values.values + if rng: + values = limit_range(values, rng=rng) + return Gini(values).g + + return graph.apply(data, _apply_gini) + + +def unique(data: DataFrame | Series, graph: Graph, dropna: bool = True): + """ + Calculates the number of unique values within neighbours defined in + ``graph``. + + .. math:: + + + Parameters + ---------- + data : DataFrame | Series + A DataFrame or Series containing the values to be analysed. + graph : libpysal.graph.Graph + A spatial weights matrix for the data. + dropna : bool (default True) + Don’t include ``NaN`` in the counts of unique values. + + Returns + ---------- + Series + A Series containing resulting values. + + Examples + -------- + >>> tessellation_df['cluster_unique'] = mm.Unique(tessellation_df['cluster'], + ... graph) + """ + + def _apply_range(values): + return values.nunique(dropna=dropna) + + return graph.apply(data, _apply_range) diff --git a/momepy/functional/tests/test_diversity.py b/momepy/functional/tests/test_diversity.py index 2fe06319..128f4642 100644 --- a/momepy/functional/tests/test_diversity.py +++ b/momepy/functional/tests/test_diversity.py @@ -1,8 +1,10 @@ import geopandas as gpd +import numpy as np +import pandas as pd import pytest from libpysal.graph import Graph from packaging.version import Version -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal import momepy as mm @@ -15,8 +17,16 @@ class TestDistribution: def setup_method(self): test_file_path = mm.datasets.get_path("bubenec") self.df_buildings = gpd.read_file(test_file_path, layer="buildings") + self.df_tessellation = gpd.read_file(test_file_path, layer="tessellation") + self.df_tessellation["area"] = self.df_tessellation.geometry.area self.graph = Graph.build_knn(self.df_buildings.centroid, k=3) + self.diversity_graph = ( + Graph.build_contiguity(self.df_tessellation) + .higher_order(k=3, lower_order=True) + .assign_self_weight() + ) + def test_describe(self): area = self.df_buildings.area r = mm.describe(area, self.graph) @@ -115,3 +125,357 @@ def test_describe_array(self): r2 = mm.describe(area.values, self.graph) assert_frame_equal(r, r2) + + def test_values_range(self): + full_sw = mm.values_range(self.df_tessellation["area"], self.diversity_graph) + full_sw_expected = { + "count": 144, + "mean": 13575.258680748986, + "min": 3789.0228732928035, + "max": 34510.77694161156, + } + print(np.mean(full_sw)) + assert_result( + full_sw, full_sw_expected, self.df_tessellation, check_names=False + ) + + limit = mm.values_range( + self.df_tessellation["area"], self.diversity_graph, rng=(10, 90) + ) + limit_expected = { + "count": 144, + "mean": 3358.45027554266, + "min": 2080.351522584218, + "max": 5115.169656715312, + } + assert_result(limit, limit_expected, self.df_tessellation, check_names=False) + + def test_theil(self): + full_sw = mm.theil(self.df_tessellation["area"], self.diversity_graph) + full_sw_expected = { + "count": 144, + "mean": 0.3367193709036915, + "min": 0.0935437083870931, + "max": 1.0063687846141105, + } + assert_result( + full_sw, full_sw_expected, self.df_tessellation, check_names=False + ) + + limit = mm.theil( + self.df_tessellation["area"], self.diversity_graph, rng=(10, 90) + ) + limit_expected = { + "count": 144, + "mean": 0.10575479289690606, + "min": 0.04633949101071495, + "max": 0.26582672704556626, + } + + assert_result(limit, limit_expected, self.df_tessellation, check_names=False) + + zeros = mm.theil( + pd.Series(np.zeros(len(self.df_tessellation)), self.df_tessellation.index), + self.graph, + ) + zeros_expected = {"count": 144, "mean": 0, "min": 0, "max": 0.0} + assert_result(zeros, zeros_expected, self.df_tessellation, check_names=False) + + def test_simpson(self): + ht_sw = mm.simpson(self.df_tessellation["area"], self.diversity_graph) + ht_sw_expected = { + "count": 144, + "mean": 0.5106343598245804, + "min": 0.3504, + "max": 0.7159183673469389, + } + assert_result(ht_sw, ht_sw_expected, self.df_tessellation, check_names=False) + + quan_sw = mm.simpson( + self.df_tessellation.area, self.diversity_graph, binning="quantiles", k=3 + ) + quan_sw_expected = { + "count": 144, + "mean": 0.36125200075406005, + "min": 0.3333333333333333, + "max": 0.4609375, + } + assert_result( + quan_sw, quan_sw_expected, self.df_tessellation, check_names=False + ) + + with pytest.raises(ValueError): + mm.simpson(self.df_tessellation.area, self.graph, binning="nonexistent") + + gs = mm.simpson( + self.df_tessellation.area, self.diversity_graph, gini_simpson=True + ) + gs_expected = { + "count": 144, + "mean": 0.4893656401754196, + "min": 0.2840816326530611, + "max": 0.6496, + } + assert_result(gs, gs_expected, self.df_tessellation, check_names=False) + + gs_inv = mm.simpson( + self.df_tessellation.area, self.diversity_graph, inverse=True + ) + gs_inv_expected = { + "count": 144, + "mean": 1.994951794685094, + "min": 1.3968072976054728, + "max": 2.853881278538813, + } + assert_result(gs_inv, gs_inv_expected, self.df_tessellation, check_names=False) + + self.df_tessellation["cat"] = list(range(8)) * 18 + cat = mm.simpson( + self.df_tessellation.cat, self.diversity_graph, categorical=True + ) + cat_expected = { + "count": 144, + "mean": 0.13227361237314683, + "min": 0.1255205234979179, + "max": 0.15625, + } + assert_result(cat, cat_expected, self.df_tessellation, check_names=False) + + def test_gini(self): + with pytest.raises(ValueError): + mm.gini(pd.Series(-1, self.df_tessellation.index), self.diversity_graph) + + full_sw = mm.gini(self.df_tessellation["area"], self.diversity_graph) + full_sw_expected = { + "count": 144, + "mean": 0.38686076469743697, + "min": 0.24235274498955336, + "max": 0.6400687910616315, + } + assert_result( + full_sw, full_sw_expected, self.df_tessellation, check_names=False + ) + + limit = mm.gini( + self.df_tessellation["area"], self.diversity_graph, rng=(10, 90) + ) + limit_expected = { + "count": 144, + "mean": 0.2525181248879755, + "min": 0.17049602697583713, + "max": 0.39018140635767645, + } + assert_result(limit, limit_expected, self.df_tessellation, check_names=False) + + def test_shannon(self): + with pytest.raises(ValueError): + mm.shannon( + self.df_tessellation.area, self.diversity_graph, binning="nonexistent" + ) + + ht_sw = mm.shannon(self.df_tessellation["area"], self.diversity_graph) + ht_sw_expected = { + "count": 144, + "mean": 0.8290031127861055, + "min": 0.4581441790615257, + "max": 1.1626998334975678, + } + assert_result(ht_sw, ht_sw_expected, self.df_tessellation, check_names=False) + + quan_sw = mm.shannon( + self.df_tessellation["area"], self.diversity_graph, binning="quantiles", k=3 + ) + quan_sw_expected = { + "count": 144, + "mean": 1.0543108593712356, + "min": 0.8647400965276372, + "max": 1.0986122886681096, + } + assert_result( + quan_sw, quan_sw_expected, self.df_tessellation, check_names=False + ) + + self.df_tessellation["cat"] = list(range(8)) * 18 + cat = mm.shannon( + self.df_tessellation.cat, self.diversity_graph, categorical=True + ) + cat_expected = { + "count": 144, + "mean": 2.0493812749063793, + "min": 1.9561874676604514, + "max": 2.0774529508369457, + } + assert_result(cat, cat_expected, self.df_tessellation, check_names=False) + + def test_unique(self): + self.df_tessellation["cat"] = list(range(8)) * 18 + un = mm.unique(self.df_tessellation["cat"], self.diversity_graph) + un_expected = {"count": 144, "mean": 8.0, "min": 8, "max": 8} + assert_result(un, un_expected, self.df_tessellation, check_names=False) + + self.df_tessellation.loc[0, "cat"] = np.nan + un_nan = mm.unique( + self.df_tessellation["cat"], self.diversity_graph, dropna=False + ) + un_nan_expected = {"count": 144, "mean": 8.13888888888889, "min": 8, "max": 9} + assert_result(un_nan, un_nan_expected, self.df_tessellation, check_names=False) + + un_nan_drop = mm.unique( + self.df_tessellation["cat"], self.diversity_graph, dropna=True + ) + un_nan_drop_expected = {"count": 144, "mean": 8.0, "min": 8, "max": 8} + assert_result( + un_nan_drop, un_nan_drop_expected, self.df_tessellation, check_names=False + ) + + +class TestDiversityEquivalence: + def setup_method(self): + test_file_path = mm.datasets.get_path("bubenec") + self.df_tessellation = gpd.read_file(test_file_path, layer="tessellation") + self.df_tessellation["area"] = self.df_tessellation.geometry.area + self.sw = mm.sw_high(k=3, gdf=self.df_tessellation, ids="uID") + self.graph = ( + Graph.build_contiguity(self.df_tessellation) + .higher_order(k=3, lower_order=True) + .assign_self_weight() + ) + + def test_values_range(self): + full_sw_new = mm.values_range(self.df_tessellation["area"], self.graph) + full_sw_old = mm.Range(self.df_tessellation, "area", self.sw, "uID").series + assert_series_equal( + full_sw_new, full_sw_old, check_dtype=False, check_names=False + ) + + limit_new = mm.values_range( + self.df_tessellation["area"], self.graph, rng=(10, 90) + ) + limit_old = mm.Range( + self.df_tessellation, "area", self.sw, "uID", rng=(10, 90) + ).series + assert_series_equal(limit_new, limit_old, check_dtype=False, check_names=False) + + def test_theil(self): + full_sw_new = mm.theil(self.df_tessellation["area"], self.graph) + full_sw_old = mm.Theil(self.df_tessellation, "area", self.sw, "uID").series + assert_series_equal( + full_sw_new, full_sw_old, check_dtype=False, check_names=False + ) + + limit_new = mm.theil(self.df_tessellation["area"], self.graph, rng=(10, 90)) + limit_old = mm.Theil( + self.df_tessellation, + self.df_tessellation.area, + self.sw, + "uID", + rng=(10, 90), + ).series + assert_series_equal(limit_new, limit_old, check_dtype=False, check_names=False) + + zeros_new = mm.theil( + pd.Series(np.zeros(len(self.df_tessellation)), self.df_tessellation.index), + self.graph, + ) + zeros_old = mm.Theil( + self.df_tessellation, np.zeros(len(self.df_tessellation)), self.sw, "uID" + ).series + assert_series_equal(zeros_new, zeros_old, check_dtype=False, check_names=False) + + def test_simpson(self): + ht_sw_new = mm.simpson(self.df_tessellation["area"], self.graph) + ht_sw_old = mm.Simpson(self.df_tessellation, "area", self.sw, "uID").series + assert_series_equal(ht_sw_new, ht_sw_old, check_dtype=False, check_names=False) + + quan_sw_new = mm.simpson( + self.df_tessellation.area, self.graph, binning="quantiles", k=3 + ) + quan_sw_old = mm.Simpson( + self.df_tessellation, + self.df_tessellation.area, + self.sw, + "uID", + binning="quantiles", + k=3, + ).series + assert_series_equal( + quan_sw_new, quan_sw_old, check_dtype=False, check_names=False + ) + + gs_new = mm.simpson(self.df_tessellation.area, self.graph, gini_simpson=True) + gs_old = mm.Simpson( + self.df_tessellation, "area", self.sw, "uID", gini_simpson=True + ).series + assert_series_equal(gs_new, gs_old, check_dtype=False, check_names=False) + + gs_new = mm.simpson(self.df_tessellation.area, self.graph, inverse=True) + gs_old = mm.Simpson( + self.df_tessellation, "area", self.sw, "uID", inverse=True + ).series + assert_series_equal(gs_new, gs_old, check_dtype=False, check_names=False) + + self.df_tessellation["cat"] = list(range(8)) * 18 + cat_new = mm.simpson(self.df_tessellation.cat, self.graph, categorical=True) + cat_old = mm.Simpson( + self.df_tessellation, "cat", self.sw, "uID", categorical=True + ).series + assert_series_equal(cat_new, cat_old, check_dtype=False, check_names=False) + + def test_gini(self): + full_sw_new = mm.gini(self.df_tessellation["area"], self.graph) + full_sw_old = mm.Gini(self.df_tessellation, "area", self.sw, "uID").series + assert_series_equal( + full_sw_new, full_sw_old, check_dtype=False, check_names=False + ) + + limit_new = mm.gini(self.df_tessellation["area"], self.graph, rng=(10, 90)) + limit_old = mm.Gini( + self.df_tessellation, "area", self.sw, "uID", rng=(10, 90) + ).series + assert_series_equal(limit_new, limit_old, check_dtype=False, check_names=False) + + def test_shannon(self): + ht_sw_new = mm.shannon(self.df_tessellation["area"], self.graph) + ht_sw_old = mm.Shannon(self.df_tessellation, "area", self.sw, "uID").series + assert_series_equal(ht_sw_new, ht_sw_old, check_dtype=False, check_names=False) + + quan_sw_new = mm.shannon( + self.df_tessellation["area"], self.graph, binning="quantiles", k=3 + ) + quan_sw_old = mm.Shannon( + self.df_tessellation, + self.df_tessellation.area, + self.sw, + "uID", + binning="quantiles", + k=3, + ).series + assert_series_equal( + quan_sw_new, quan_sw_old, check_dtype=False, check_names=False + ) + + self.df_tessellation["cat"] = list(range(8)) * 18 + cat_new = mm.shannon(self.df_tessellation.cat, self.graph, categorical=True) + cat_old = mm.Shannon( + self.df_tessellation, "cat", self.sw, "uID", categorical=True + ).series + assert_series_equal(cat_new, cat_old, check_dtype=False, check_names=False) + + def test_unique(self): + self.df_tessellation["cat"] = list(range(8)) * 18 + un_new = mm.unique(self.df_tessellation["cat"], self.graph) + un_old = mm.Unique(self.df_tessellation, "cat", self.sw, "uID").series + assert_series_equal(un_new, un_old, check_dtype=False, check_names=False) + + self.df_tessellation.loc[0, "cat"] = np.nan + un_new = mm.unique(self.df_tessellation["cat"], self.graph, dropna=False) + un_old = mm.Unique( + self.df_tessellation, "cat", self.sw, "uID", dropna=False + ).series + assert_series_equal(un_new, un_old, check_dtype=False, check_names=False) + + un_new = mm.unique(self.df_tessellation["cat"], self.graph, dropna=True) + un_old = mm.Unique( + self.df_tessellation, "cat", self.sw, "uID", dropna=True + ).series + assert_series_equal(un_new, un_old, check_dtype=False, check_names=False) From 15d1c0781f4b582865e7c98edb5f8edbbec6adc8 Mon Sep 17 00:00:00 2001 From: Krasen Samardzhiev Date: Thu, 6 Jun 2024 14:14:30 +0200 Subject: [PATCH 2/4] typing --- momepy/functional/_diversity.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py index d9412e0e..55877126 100644 --- a/momepy/functional/_diversity.py +++ b/momepy/functional/_diversity.py @@ -251,7 +251,7 @@ def values_range( return stats[max(q)] - stats[min(q)] -def theil(y: Series, graph: Graph, q: tuple | list = None): +def theil(y: Series, graph: Graph, q: tuple | list | None = None): """Calculates the Theil measure of inequality of values within neighbours defined in ``graph``. Uses ``inequality.theil.Theil`` under the hood. Requires '`inequality`' package. @@ -386,7 +386,7 @@ def shannon( graph: Graph, binning: str = "HeadTailBreaks", categorical: bool = False, - categories: list = None, + categories: list | None = None, **classification_kwds, ): """Calculates the Shannon index of values within neighbours defined in @@ -446,7 +446,7 @@ def _apply_shannon(values): return graph.apply(y, _apply_shannon) -def gini(y: Series, graph: Graph, q: tuple | list = None): +def gini(y: Series, graph: Graph, q: tuple | list | None = None): """Calculates the Gini index of values within neighbours defined in ``graph``. Uses ``inequality.gini.Gini`` under the hood. Requires '`inequality`' package. From 91862a7ae2bc81ff9eea5a8cfcbd610d390fc586 Mon Sep 17 00:00:00 2001 From: Krasen Samardzhiev Date: Tue, 11 Jun 2024 16:44:27 +0200 Subject: [PATCH 3/4] test versionning --- momepy/functional/tests/test_diversity.py | 88 +++++++++++------------ 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/momepy/functional/tests/test_diversity.py b/momepy/functional/tests/test_diversity.py index bc9e834c..82066887 100644 --- a/momepy/functional/tests/test_diversity.py +++ b/momepy/functional/tests/test_diversity.py @@ -215,18 +215,16 @@ def test_theil(self): full_sw2, full_sw_expected, self.df_tessellation, check_names=False ) - ## mismatch between percentile interpolation methods - # limit = mm.theil( - # self.df_tessellation["area"], self.diversity_graph, q=(10, 90) - # ) - # limit_expected = { - # "count": 144, - # "mean": 0.10575479289690606, - # "min": 0.04633949101071495, - # "max": 0.26582672704556626, - # } - - # assert_result(limit, limit_expected, self.df_tessellation, check_names=False) + # mismatch between percentile interpolation methods + limit = mm.theil(self.df_tessellation["area"], self.diversity_graph, q=(10, 90)) + limit_expected = { + "count": 144, + "mean": 0.09689345872019642, + "min": 0.03089398223055910, + "max": 0.2726670141461655, + } + + assert_result(limit, limit_expected, self.df_tessellation, check_names=False) zeros = mm.theil( pd.Series(np.zeros(len(self.df_tessellation)), self.df_tessellation.index), @@ -310,17 +308,15 @@ def test_gini(self): full_sw, full_sw_expected, self.df_tessellation, check_names=False ) - ## mismatch between interpolation methods - # limit = mm.gini( - # self.df_tessellation["area"], self.diversity_graph, q=(10, 90) - # ) - # limit_expected = { - # "count": 144, - # "mean": 0.2525181248879755, - # "min": 0.17049602697583713, - # "max": 0.39018140635767645, - # } - # assert_result(limit, limit_expected, self.df_tessellation, check_names=False) + # mismatch between interpolation methods + limit = mm.gini(self.df_tessellation["area"], self.diversity_graph, q=(10, 90)) + limit_expected = { + "count": 144, + "mean": 0.2417437064941186, + "min": 0.14098983070917345, + "max": 0.3978182288393458, + } + assert_result(limit, limit_expected, self.df_tessellation, check_names=False) def test_shannon(self): with pytest.raises(ValueError): @@ -740,19 +736,21 @@ def test_theil(self): full_sw_new, full_sw_old, check_dtype=False, check_names=False ) - # ## old and new have different percentile interpolation methods - # limit_new = mm.theil( - # self.df_tessellation["area"], self.graph_diversity, q=(10, 90) - # ) - # limit_old = mm.Theil( - # self.df_tessellation, - # self.df_tessellation.area, - # self.sw, - # "uID", - # rng=(10, 90), - # ).series - # assert_series_equal(limit_new, limit_old, - # check_dtype=False, check_names=False) + # old and new have different percentile interpolation methods + # therefore the comparison needs a higher rtol + limit_new = mm.theil( + self.df_tessellation["area"], self.graph_diversity, q=(10, 90) + ) + limit_old = mm.Theil( + self.df_tessellation, + self.df_tessellation.area, + self.sw, + "uID", + rng=(10, 90), + ).series + assert_series_equal( + limit_new, limit_old, rtol=0.5, check_dtype=False, check_names=False + ) zeros_new = mm.theil( pd.Series(np.zeros(len(self.df_tessellation)), self.df_tessellation.index), @@ -816,14 +814,16 @@ def test_gini(self): ) # ## old and new have different interpolation methods - # limit_new = mm.gini( - # self.df_tessellation["area"], self.graph_diversity, q=(10, 90) - # ) - # limit_old = mm.Gini( - # self.df_tessellation, "area", self.sw, "uID", rng=(10, 90) - # ).series - # assert_series_equal(limit_new, limit_old, - # check_dtype=False, check_names=False) + ## there need higher rtol + limit_new = mm.gini( + self.df_tessellation["area"], self.graph_diversity, q=(10, 90) + ) + limit_old = mm.Gini( + self.df_tessellation, "area", self.sw, "uID", rng=(10, 90) + ).series + assert_series_equal( + limit_new, limit_old, rtol=0.3, check_dtype=False, check_names=False + ) def test_shannon(self): ht_sw_new = mm.shannon(self.df_tessellation["area"], self.graph_diversity) From ef230f46f6779767b6ff2e43cfbb6d7d6a16b1a9 Mon Sep 17 00:00:00 2001 From: Martin Fleischmann Date: Wed, 12 Jun 2024 10:03:11 +0200 Subject: [PATCH 4/4] relative imports, types, notes --- momepy/functional/_diversity.py | 46 +++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py index 55877126..bc144195 100644 --- a/momepy/functional/_diversity.py +++ b/momepy/functional/_diversity.py @@ -8,7 +8,7 @@ from packaging.version import Version from pandas import DataFrame, Series -import momepy as mm +from ..diversity import shannon_diversity, simpson_diversity try: from numba import njit @@ -171,7 +171,7 @@ def describe( Notes ----- - The index of ``values`` must match the index along which the ``graph`` is + The index of ``y`` must match the index along which the ``graph`` is built. The numba package is used extensively in this function to accelerate the computation @@ -219,11 +219,16 @@ def describe( def values_range( y: Series | NDArray[np.float64], graph: Graph, q: tuple | list = (0, 100) -): +) -> Series: """Calculates the range of values within neighbours defined in ``graph``. Adapted from :cite:`dibble2017`. + Notes + ----- + The index of ``y`` must match the index along which the ``graph`` is + built. + Parameters ---------- data : Series @@ -247,13 +252,14 @@ def values_range( ... q=(25, 75)) """ - stats = mm.percentile(y, graph, q=q) + stats = percentile(y, graph, q=q) return stats[max(q)] - stats[min(q)] -def theil(y: Series, graph: Graph, q: tuple | list | None = None): +def theil(y: Series, graph: Graph, q: tuple | list | None = None) -> Series: """Calculates the Theil measure of inequality of values within neighbours defined in ``graph``. + Uses ``inequality.theil.Theil`` under the hood. Requires '`inequality`' package. .. math:: @@ -264,6 +270,11 @@ def theil(y: Series, graph: Graph, q: tuple | list | None = None): \\right] \\right) + Notes + ----- + The index of ``y`` must match the index along which the ``graph`` is + built. + Parameters ---------- y : Series @@ -309,7 +320,7 @@ def simpson( inverse: bool = False, categorical: bool = False, **classification_kwds, -): +) -> Series: """Calculates the Simpson's diversity index of values within neighbours defined in ``graph``. Uses ``mapclassify.classifiers`` under the hood for binning. @@ -321,6 +332,11 @@ def simpson( Adapted from :cite:`feliciotti2018`. + Notes + ----- + The index of ``y`` must match the index along which the ``graph`` is + built. + Parameters ---------- y : Series @@ -366,7 +382,7 @@ def simpson( bins = None def _apply_simpson_diversity(values): - return mm.simpson_diversity( + return simpson_diversity( values, bins, categorical=categorical, @@ -388,7 +404,7 @@ def shannon( categorical: bool = False, categories: list | None = None, **classification_kwds, -): +) -> Series: """Calculates the Shannon index of values within neighbours defined in ``graph``. Uses ``mapclassify.classifiers`` under the hood @@ -398,6 +414,11 @@ def shannon( H^{\\prime}=-\\sum_{i=1}^{R} p_{i} \\ln p_{i} + Notes + ----- + The index of ``y`` must match the index along which the ``graph`` is + built. + Parameters ---------- y : Series @@ -441,16 +462,19 @@ def shannon( bins = categories def _apply_shannon(values): - return mm.shannon_diversity(values, bins, categorical, categories) + return shannon_diversity(values, bins, categorical, categories) return graph.apply(y, _apply_shannon) -def gini(y: Series, graph: Graph, q: tuple | list | None = None): +def gini(y: Series, graph: Graph, q: tuple | list | None = None) -> Series: """Calculates the Gini index of values within neighbours defined in ``graph``. Uses ``inequality.gini.Gini`` under the hood. Requires '`inequality`' package. - .. math:: + Notes + ----- + The index of ``y`` must match the index along which the ``graph`` is + built. Parameters ----------