From 66ec9d1c94210d2afc0075349b5caf1082bb8637 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev <krasensam@gmail.com>
Date: Sat, 4 May 2024 20:23:33 +0200
Subject: [PATCH 1/4] functional diversity implementation

---
 momepy/functional/_diversity.py           | 327 ++++++++++++++++++-
 momepy/functional/tests/test_diversity.py | 366 +++++++++++++++++++++-
 2 files changed, 691 insertions(+), 2 deletions(-)

diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py
index 5ee80bf9..8c9829b7 100644
--- a/momepy/functional/_diversity.py
+++ b/momepy/functional/_diversity.py
@@ -1,11 +1,14 @@
 import warnings
 
 import numpy as np
+import scipy as sp
 from libpysal.graph import Graph
 from numpy.typing import NDArray
 from pandas import DataFrame, Series
 
-__all__ = ["describe"]
+import momepy as mm
+
+__all__ = ["describe", "values_range", "theil", "simpson", "shannon", "gini", "unique"]
 
 
 def describe(
@@ -117,3 +120,325 @@ def _describe(values, q, include_mode=False):
         stat_.columns = cols
 
     return stat_
+
+
+def values_range(
+    data: DataFrame | Series, graph: Graph, rng: tuple | list = (0, 100), **kwargs
+):
+    """
+    Calculates the range of values within neighbours defined in ``graph``.
+    Uses ``scipy.stats.iqr`` under the hood.
+
+    Adapted from :cite:`dibble2017`.
+
+    Parameters
+    ----------
+    data : DataFrame | Series
+        A DataFrame or Series containing the values to be analysed.
+    graph : libpysal.graph.Graph
+        A Graph defining the neighbours of each observation in ``data``.
+    rng : tuple, list, optional (default (0, 100))
+        A two-element sequence containing floats between 0 and 100 (inclusive)
+        that are the percentiles over which to compute the range.
+        The order of the elements is not important.
+    **kwargs : dict
+        Optional arguments for ``scipy.stats.iqr``.
+
+    Returns
+    -------
+    Series
+        A Series containing resulting values.
+
+    Examples
+    --------
+    >>> tessellation_df['area_IQR_3steps'] = mm.values_range(
+    ...     tessellation_df['area'], graph, rng=(25, 75)
+    ... )
+    """
+
+    def _apply_range(values):
+        return sp.stats.iqr(values, rng=rng, **kwargs)
+
+    return graph.apply(data, _apply_range)
+
+
+def theil(data: DataFrame | Series, graph: Graph, rng: tuple | list = None):
+    """
+    Calculates the Theil measure of inequality of values within neighbours defined in
+    ``graph``. Uses ``inequality.theil.Theil`` under the hood.
+    Requires '`inequality`' package.
+
+    .. math::
+
+        T = \\sum_{i=1}^n \\left(
+            \\frac{y_i}{\\sum_{i=1}^n y_i} \\ln \\left[
+                N \\frac{y_i} {\\sum_{i=1}^n y_i}
+            \\right]
+        \\right)
+
+    Parameters
+    ----------
+    data : DataFrame | Series
+        A DataFrame or Series containing the values to be analysed.
+    graph : libpysal.graph.Graph
+        A Graph defining the neighbours of each observation in ``data``.
+    rng : tuple, list, optional (default None)
+        A two-element sequence of percentiles between 0 and 100 (inclusive).
+        If given, each neighbourhood is passed through ``momepy.limit_range``
+        with this ``rng`` before the index is computed.
+
+    Returns
+    -------
+    Series
+        A Series containing resulting values.
+
+    Examples
+    --------
+    >>> tessellation_df['area_Theil'] = mm.theil(tessellation_df['area'],
+    ...                                          graph)
+    """
+
+    try:
+        from inequality.theil import Theil
+    except ImportError as err:
+        raise ImportError("The 'inequality' package is required.") from err
+    if rng:
+        from momepy import limit_range
+
+    def _apply_theil(values):
+        if rng:
+            values = limit_range(values, rng=rng)
+        return Theil(values).T
+
+    return graph.apply(data, _apply_theil)
+
+
+def simpson(
+    data: DataFrame | Series,
+    graph: Graph,
+    binning: str = "HeadTailBreaks",
+    gini_simpson: bool = False,
+    inverse: bool = False,
+    categorical: bool = False,
+    **classification_kwds,
+):
+    """
+    Calculates the Simpson's diversity index of values within neighbours defined in
+    ``graph``. Uses ``mapclassify.classifiers`` under the hood for binning.
+    Requires ``mapclassify>=2.4.2`` dependency.
+
+    .. math::
+
+        \\lambda=\\sum_{i=1}^{R} p_{i}^{2}
+
+    Adapted from :cite:`feliciotti2018`.
+
+    Parameters
+    ----------
+    data : DataFrame | Series
+        A DataFrame or Series containing the values to be analysed.
+    graph : libpysal.graph.Graph
+        A Graph defining the neighbours of each observation in ``data``.
+    binning : str (default 'HeadTailBreaks')
+        One of mapclassify classification schemes. For details see
+        `mapclassify API documentation <http://pysal.org/mapclassify/api.html>`_.
+    gini_simpson : bool (default False)
+        Return Gini-Simpson index instead of Simpson index (``1 - λ``).
+    inverse : bool (default False)
+        Return Inverse Simpson index (``1 / λ``); ignored if ``gini_simpson``.
+    categorical : bool (default False)
+        Treat values as categories (will not use ``binning``).
+    **classification_kwds : dict
+        Keyword arguments for the classification scheme.
+        For details see `mapclassify documentation <https://pysal.org/mapclassify>`_.
+
+    Returns
+    -------
+    Series
+        A Series containing resulting values.
+
+    Examples
+    --------
+    >>> tessellation_df['area_Simpson'] = mm.simpson(tessellation_df['area'],
+    ...                                              graph)
+
+    See also
+    --------
+    momepy.simpson_diversity : Calculates the Simpson's diversity index of data.
+    """
+    if not categorical:
+        try:
+            from mapclassify import classify
+        except ImportError as err:
+            raise ImportError(
+                "The 'mapclassify >= 2.4.2' package is required."
+            ) from err
+        bins = classify(data, scheme=binning, **classification_kwds).bins
+    else:
+        bins = None
+
+    def _apply_simpson_diversity(values):
+        return mm.simpson_diversity(
+            values,
+            bins,
+            categorical=categorical,
+        )
+
+    result = graph.apply(data, _apply_simpson_diversity)
+
+    if gini_simpson:
+        result = 1 - result
+    elif inverse:
+        result = 1 / result
+    return result
+
+
+def shannon(
+    data: DataFrame | Series,
+    graph: Graph,
+    binning: str = "HeadTailBreaks",
+    categorical: bool = False,
+    categories: list = None,
+    **classification_kwds,
+):
+    """
+    Calculates the Shannon index of values within neighbours defined in
+    ``graph``. Uses ``mapclassify.classifiers`` under the hood
+    for binning. Requires ``mapclassify>=2.4.2`` dependency.
+
+    .. math::
+
+        H^{\\prime}=-\\sum_{i=1}^{R} p_{i} \\ln p_{i}
+
+    Parameters
+    ----------
+    data : DataFrame | Series
+        A DataFrame or Series containing the values to be analysed.
+    graph : libpysal.graph.Graph
+        A Graph defining the neighbours of each observation in ``data``.
+    binning : str (default 'HeadTailBreaks')
+        One of mapclassify classification schemes. For details see
+        `mapclassify API documentation <http://pysal.org/mapclassify/api.html>`_.
+    categorical : bool (default False)
+        Treat values as categories (will not use binning).
+    categories : list-like (default None)
+        A list of categories. If ``None`` or empty, ``data.unique()`` is used.
+    **classification_kwds : dict
+        Keyword arguments for classification scheme
+        For details see `mapclassify documentation <https://pysal.org/mapclassify>`_.
+
+    Returns
+    -------
+    Series
+        A Series containing resulting values.
+
+    Examples
+    --------
+    >>> tessellation_df['area_Shannon'] = mm.shannon(tessellation_df['area'],
+    ...                                              graph)
+    """
+
+    if categories is None or len(categories) == 0:
+        categories = data.unique()
+
+    if not categorical:
+        try:
+            from mapclassify import classify
+        except ImportError as err:
+            raise ImportError(
+                "The 'mapclassify >= 2.4.2' package is required."
+            ) from err
+        bins = classify(data, scheme=binning, **classification_kwds).bins
+    else:
+        bins = categories
+
+    def _apply_shannon(values):
+        return mm.shannon_diversity(values, bins, categorical, categories)
+
+    return graph.apply(data, _apply_shannon)
+
+
+def gini(data: DataFrame | Series, graph: Graph, rng: tuple | list = None):
+    """
+    Calculates the Gini index of values within neighbours defined in
+    ``graph``. Uses ``inequality.gini.Gini`` under the hood.
+    Requires '`inequality`' package.
+
+    Raises ``ValueError`` if ``data`` contains negative values.
+
+    Parameters
+    ----------
+    data : DataFrame | Series
+        A DataFrame or Series containing the values to be analysed.
+    graph : libpysal.graph.Graph
+        A Graph defining the neighbours of each observation in ``data``.
+    rng : tuple, list, optional (default None)
+        A two-element sequence of percentiles between 0 and 100 (inclusive).
+        If given, each neighbourhood is passed through ``momepy.limit_range``
+        with this ``rng`` before the index is computed.
+
+    Returns
+    -------
+    Series
+        A Series containing resulting values.
+
+    Examples
+    --------
+    >>> tessellation_df['area_Gini'] = mm.gini(tessellation_df['area'],
+    ...                                        graph)
+    """
+    try:
+        from inequality.gini import Gini
+    except ImportError as err:
+        raise ImportError("The 'inequality' package is required.") from err
+
+    if data.min() < 0:
+        raise ValueError(
+            "Values contain negative numbers. Normalise data before "
+            "using momepy.gini."
+        )
+    if rng:
+        from momepy import limit_range
+
+    def _apply_gini(values):
+        if isinstance(values, Series):
+            values = values.values
+        if rng:
+            values = limit_range(values, rng=rng)
+        return Gini(values).g
+
+    return graph.apply(data, _apply_gini)
+
+
+def unique(data: DataFrame | Series, graph: Graph, dropna: bool = True):
+    """
+    Calculates the number of unique values within neighbours defined in
+    ``graph``.
+
+    Uses ``nunique`` on the values of each neighbourhood, so ``dropna`` decides
+    whether ``NaN`` is counted as a distinct value.
+
+    Parameters
+    ----------
+    data : DataFrame | Series
+        A DataFrame or Series containing the values to be analysed.
+    graph : libpysal.graph.Graph
+        A Graph defining the neighbours of each observation in ``data``.
+    dropna : bool (default True)
+        Do not include ``NaN`` in the counts of unique values.
+
+    Returns
+    -------
+    Series
+        A Series containing resulting values.
+
+    Examples
+    --------
+    >>> tessellation_df['cluster_unique'] = mm.unique(tessellation_df['cluster'],
+    ...                                               graph)
+    """
+
+    def _apply_unique(values):
+        return values.nunique(dropna=dropna)
+
+    return graph.apply(data, _apply_unique)
diff --git a/momepy/functional/tests/test_diversity.py b/momepy/functional/tests/test_diversity.py
index 2fe06319..128f4642 100644
--- a/momepy/functional/tests/test_diversity.py
+++ b/momepy/functional/tests/test_diversity.py
@@ -1,8 +1,10 @@
 import geopandas as gpd
+import numpy as np
+import pandas as pd
 import pytest
 from libpysal.graph import Graph
 from packaging.version import Version
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal, assert_series_equal
 
 import momepy as mm
 
@@ -15,8 +17,16 @@ class TestDistribution:
     def setup_method(self):
         test_file_path = mm.datasets.get_path("bubenec")
         self.df_buildings = gpd.read_file(test_file_path, layer="buildings")
+        self.df_tessellation = gpd.read_file(test_file_path, layer="tessellation")
+        self.df_tessellation["area"] = self.df_tessellation.geometry.area
         self.graph = Graph.build_knn(self.df_buildings.centroid, k=3)
 
+        self.diversity_graph = (
+            Graph.build_contiguity(self.df_tessellation)
+            .higher_order(k=3, lower_order=True)
+            .assign_self_weight()
+        )
+
     def test_describe(self):
         area = self.df_buildings.area
         r = mm.describe(area, self.graph)
@@ -115,3 +125,357 @@ def test_describe_array(self):
         r2 = mm.describe(area.values, self.graph)
 
         assert_frame_equal(r, r2)
+
+    def test_values_range(self):
+        # default ``rng=(0, 100)`` spans the full percentile range
+        full_sw = mm.values_range(self.df_tessellation["area"], self.diversity_graph)
+        full_sw_expected = {
+            "count": 144,
+            "mean": 13575.258680748986,
+            "min": 3789.0228732928035,
+            "max": 34510.77694161156,
+        }
+        assert_result(
+            full_sw, full_sw_expected, self.df_tessellation, check_names=False
+        )
+
+        limit = mm.values_range(
+            self.df_tessellation["area"], self.diversity_graph, rng=(10, 90)
+        )
+        limit_expected = {
+            "count": 144,
+            "mean": 3358.45027554266,
+            "min": 2080.351522584218,
+            "max": 5115.169656715312,
+        }
+        assert_result(limit, limit_expected, self.df_tessellation, check_names=False)
+
+    def test_theil(self):
+        full_sw = mm.theil(self.df_tessellation["area"], self.diversity_graph)
+        full_sw_expected = {
+            "count": 144,
+            "mean": 0.3367193709036915,
+            "min": 0.0935437083870931,
+            "max": 1.0063687846141105,
+        }
+        assert_result(
+            full_sw, full_sw_expected, self.df_tessellation, check_names=False
+        )
+
+        limit = mm.theil(
+            self.df_tessellation["area"], self.diversity_graph, rng=(10, 90)
+        )
+        limit_expected = {
+            "count": 144,
+            "mean": 0.10575479289690606,
+            "min": 0.04633949101071495,
+            "max": 0.26582672704556626,
+        }
+
+        assert_result(limit, limit_expected, self.df_tessellation, check_names=False)
+
+        zeros = mm.theil(
+            pd.Series(np.zeros(len(self.df_tessellation)), self.df_tessellation.index),
+            self.graph,
+        )
+        zeros_expected = {"count": 144, "mean": 0, "min": 0, "max": 0.0}
+        assert_result(zeros, zeros_expected, self.df_tessellation, check_names=False)
+
+    def test_simpson(self):
+        ht_sw = mm.simpson(self.df_tessellation["area"], self.diversity_graph)
+        ht_sw_expected = {
+            "count": 144,
+            "mean": 0.5106343598245804,
+            "min": 0.3504,
+            "max": 0.7159183673469389,
+        }
+        assert_result(ht_sw, ht_sw_expected, self.df_tessellation, check_names=False)
+
+        quan_sw = mm.simpson(
+            self.df_tessellation.area, self.diversity_graph, binning="quantiles", k=3
+        )
+        quan_sw_expected = {
+            "count": 144,
+            "mean": 0.36125200075406005,
+            "min": 0.3333333333333333,
+            "max": 0.4609375,
+        }
+        assert_result(
+            quan_sw, quan_sw_expected, self.df_tessellation, check_names=False
+        )
+
+        with pytest.raises(ValueError):
+            mm.simpson(self.df_tessellation.area, self.graph, binning="nonexistent")
+
+        gs = mm.simpson(
+            self.df_tessellation.area, self.diversity_graph, gini_simpson=True
+        )
+        gs_expected = {
+            "count": 144,
+            "mean": 0.4893656401754196,
+            "min": 0.2840816326530611,
+            "max": 0.6496,
+        }
+        assert_result(gs, gs_expected, self.df_tessellation, check_names=False)
+
+        gs_inv = mm.simpson(
+            self.df_tessellation.area, self.diversity_graph, inverse=True
+        )
+        gs_inv_expected = {
+            "count": 144,
+            "mean": 1.994951794685094,
+            "min": 1.3968072976054728,
+            "max": 2.853881278538813,
+        }
+        assert_result(gs_inv, gs_inv_expected, self.df_tessellation, check_names=False)
+
+        self.df_tessellation["cat"] = list(range(8)) * 18
+        cat = mm.simpson(
+            self.df_tessellation.cat, self.diversity_graph, categorical=True
+        )
+        cat_expected = {
+            "count": 144,
+            "mean": 0.13227361237314683,
+            "min": 0.1255205234979179,
+            "max": 0.15625,
+        }
+        assert_result(cat, cat_expected, self.df_tessellation, check_names=False)
+
+    def test_gini(self):
+        with pytest.raises(ValueError):
+            mm.gini(pd.Series(-1, self.df_tessellation.index), self.diversity_graph)
+
+        full_sw = mm.gini(self.df_tessellation["area"], self.diversity_graph)
+        full_sw_expected = {
+            "count": 144,
+            "mean": 0.38686076469743697,
+            "min": 0.24235274498955336,
+            "max": 0.6400687910616315,
+        }
+        assert_result(
+            full_sw, full_sw_expected, self.df_tessellation, check_names=False
+        )
+
+        limit = mm.gini(
+            self.df_tessellation["area"], self.diversity_graph, rng=(10, 90)
+        )
+        limit_expected = {
+            "count": 144,
+            "mean": 0.2525181248879755,
+            "min": 0.17049602697583713,
+            "max": 0.39018140635767645,
+        }
+        assert_result(limit, limit_expected, self.df_tessellation, check_names=False)
+
+    def test_shannon(self):
+        with pytest.raises(ValueError):
+            mm.shannon(
+                self.df_tessellation.area, self.diversity_graph, binning="nonexistent"
+            )
+
+        ht_sw = mm.shannon(self.df_tessellation["area"], self.diversity_graph)
+        ht_sw_expected = {
+            "count": 144,
+            "mean": 0.8290031127861055,
+            "min": 0.4581441790615257,
+            "max": 1.1626998334975678,
+        }
+        assert_result(ht_sw, ht_sw_expected, self.df_tessellation, check_names=False)
+
+        quan_sw = mm.shannon(
+            self.df_tessellation["area"], self.diversity_graph, binning="quantiles", k=3
+        )
+        quan_sw_expected = {
+            "count": 144,
+            "mean": 1.0543108593712356,
+            "min": 0.8647400965276372,
+            "max": 1.0986122886681096,
+        }
+        assert_result(
+            quan_sw, quan_sw_expected, self.df_tessellation, check_names=False
+        )
+
+        self.df_tessellation["cat"] = list(range(8)) * 18
+        cat = mm.shannon(
+            self.df_tessellation.cat, self.diversity_graph, categorical=True
+        )
+        cat_expected = {
+            "count": 144,
+            "mean": 2.0493812749063793,
+            "min": 1.9561874676604514,
+            "max": 2.0774529508369457,
+        }
+        assert_result(cat, cat_expected, self.df_tessellation, check_names=False)
+
+    def test_unique(self):
+        self.df_tessellation["cat"] = list(range(8)) * 18
+        un = mm.unique(self.df_tessellation["cat"], self.diversity_graph)
+        un_expected = {"count": 144, "mean": 8.0, "min": 8, "max": 8}
+        assert_result(un, un_expected, self.df_tessellation, check_names=False)
+
+        self.df_tessellation.loc[0, "cat"] = np.nan
+        un_nan = mm.unique(
+            self.df_tessellation["cat"], self.diversity_graph, dropna=False
+        )
+        un_nan_expected = {"count": 144, "mean": 8.13888888888889, "min": 8, "max": 9}
+        assert_result(un_nan, un_nan_expected, self.df_tessellation, check_names=False)
+
+        un_nan_drop = mm.unique(
+            self.df_tessellation["cat"], self.diversity_graph, dropna=True
+        )
+        un_nan_drop_expected = {"count": 144, "mean": 8.0, "min": 8, "max": 8}
+        assert_result(
+            un_nan_drop, un_nan_drop_expected, self.df_tessellation, check_names=False
+        )
+
+
+class TestDiversityEquivalence:
+    def setup_method(self):
+        test_file_path = mm.datasets.get_path("bubenec")
+        self.df_tessellation = gpd.read_file(test_file_path, layer="tessellation")
+        self.df_tessellation["area"] = self.df_tessellation.geometry.area
+        self.sw = mm.sw_high(k=3, gdf=self.df_tessellation, ids="uID")
+        self.graph = (
+            Graph.build_contiguity(self.df_tessellation)
+            .higher_order(k=3, lower_order=True)
+            .assign_self_weight()
+        )
+
+    def test_values_range(self):
+        full_sw_new = mm.values_range(self.df_tessellation["area"], self.graph)
+        full_sw_old = mm.Range(self.df_tessellation, "area", self.sw, "uID").series
+        assert_series_equal(
+            full_sw_new, full_sw_old, check_dtype=False, check_names=False
+        )
+
+        limit_new = mm.values_range(
+            self.df_tessellation["area"], self.graph, rng=(10, 90)
+        )
+        limit_old = mm.Range(
+            self.df_tessellation, "area", self.sw, "uID", rng=(10, 90)
+        ).series
+        assert_series_equal(limit_new, limit_old, check_dtype=False, check_names=False)
+
+    def test_theil(self):
+        full_sw_new = mm.theil(self.df_tessellation["area"], self.graph)
+        full_sw_old = mm.Theil(self.df_tessellation, "area", self.sw, "uID").series
+        assert_series_equal(
+            full_sw_new, full_sw_old, check_dtype=False, check_names=False
+        )
+
+        limit_new = mm.theil(self.df_tessellation["area"], self.graph, rng=(10, 90))
+        limit_old = mm.Theil(
+            self.df_tessellation,
+            self.df_tessellation.area,
+            self.sw,
+            "uID",
+            rng=(10, 90),
+        ).series
+        assert_series_equal(limit_new, limit_old, check_dtype=False, check_names=False)
+
+        zeros_new = mm.theil(
+            pd.Series(np.zeros(len(self.df_tessellation)), self.df_tessellation.index),
+            self.graph,
+        )
+        zeros_old = mm.Theil(
+            self.df_tessellation, np.zeros(len(self.df_tessellation)), self.sw, "uID"
+        ).series
+        assert_series_equal(zeros_new, zeros_old, check_dtype=False, check_names=False)
+
+    def test_simpson(self):
+        ht_sw_new = mm.simpson(self.df_tessellation["area"], self.graph)
+        ht_sw_old = mm.Simpson(self.df_tessellation, "area", self.sw, "uID").series
+        assert_series_equal(ht_sw_new, ht_sw_old, check_dtype=False, check_names=False)
+
+        quan_sw_new = mm.simpson(
+            self.df_tessellation.area, self.graph, binning="quantiles", k=3
+        )
+        quan_sw_old = mm.Simpson(
+            self.df_tessellation,
+            self.df_tessellation.area,
+            self.sw,
+            "uID",
+            binning="quantiles",
+            k=3,
+        ).series
+        assert_series_equal(
+            quan_sw_new, quan_sw_old, check_dtype=False, check_names=False
+        )
+
+        gs_new = mm.simpson(self.df_tessellation.area, self.graph, gini_simpson=True)
+        gs_old = mm.Simpson(
+            self.df_tessellation, "area", self.sw, "uID", gini_simpson=True
+        ).series
+        assert_series_equal(gs_new, gs_old, check_dtype=False, check_names=False)
+
+        gs_new = mm.simpson(self.df_tessellation.area, self.graph, inverse=True)
+        gs_old = mm.Simpson(
+            self.df_tessellation, "area", self.sw, "uID", inverse=True
+        ).series
+        assert_series_equal(gs_new, gs_old, check_dtype=False, check_names=False)
+
+        self.df_tessellation["cat"] = list(range(8)) * 18
+        cat_new = mm.simpson(self.df_tessellation.cat, self.graph, categorical=True)
+        cat_old = mm.Simpson(
+            self.df_tessellation, "cat", self.sw, "uID", categorical=True
+        ).series
+        assert_series_equal(cat_new, cat_old, check_dtype=False, check_names=False)
+
+    def test_gini(self):
+        full_sw_new = mm.gini(self.df_tessellation["area"], self.graph)
+        full_sw_old = mm.Gini(self.df_tessellation, "area", self.sw, "uID").series
+        assert_series_equal(
+            full_sw_new, full_sw_old, check_dtype=False, check_names=False
+        )
+
+        limit_new = mm.gini(self.df_tessellation["area"], self.graph, rng=(10, 90))
+        limit_old = mm.Gini(
+            self.df_tessellation, "area", self.sw, "uID", rng=(10, 90)
+        ).series
+        assert_series_equal(limit_new, limit_old, check_dtype=False, check_names=False)
+
+    def test_shannon(self):
+        ht_sw_new = mm.shannon(self.df_tessellation["area"], self.graph)
+        ht_sw_old = mm.Shannon(self.df_tessellation, "area", self.sw, "uID").series
+        assert_series_equal(ht_sw_new, ht_sw_old, check_dtype=False, check_names=False)
+
+        quan_sw_new = mm.shannon(
+            self.df_tessellation["area"], self.graph, binning="quantiles", k=3
+        )
+        quan_sw_old = mm.Shannon(
+            self.df_tessellation,
+            self.df_tessellation.area,
+            self.sw,
+            "uID",
+            binning="quantiles",
+            k=3,
+        ).series
+        assert_series_equal(
+            quan_sw_new, quan_sw_old, check_dtype=False, check_names=False
+        )
+
+        self.df_tessellation["cat"] = list(range(8)) * 18
+        cat_new = mm.shannon(self.df_tessellation.cat, self.graph, categorical=True)
+        cat_old = mm.Shannon(
+            self.df_tessellation, "cat", self.sw, "uID", categorical=True
+        ).series
+        assert_series_equal(cat_new, cat_old, check_dtype=False, check_names=False)
+
+    def test_unique(self):
+        self.df_tessellation["cat"] = list(range(8)) * 18
+        un_new = mm.unique(self.df_tessellation["cat"], self.graph)
+        un_old = mm.Unique(self.df_tessellation, "cat", self.sw, "uID").series
+        assert_series_equal(un_new, un_old, check_dtype=False, check_names=False)
+
+        self.df_tessellation.loc[0, "cat"] = np.nan
+        un_new = mm.unique(self.df_tessellation["cat"], self.graph, dropna=False)
+        un_old = mm.Unique(
+            self.df_tessellation, "cat", self.sw, "uID", dropna=False
+        ).series
+        assert_series_equal(un_new, un_old, check_dtype=False, check_names=False)
+
+        un_new = mm.unique(self.df_tessellation["cat"], self.graph, dropna=True)
+        un_old = mm.Unique(
+            self.df_tessellation, "cat", self.sw, "uID", dropna=True
+        ).series
+        assert_series_equal(un_new, un_old, check_dtype=False, check_names=False)

From 15d1c0781f4b582865e7c98edb5f8edbbec6adc8 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev <krasensam@gmail.com>
Date: Thu, 6 Jun 2024 14:14:30 +0200
Subject: [PATCH 2/4] typing

---
 momepy/functional/_diversity.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py
index d9412e0e..55877126 100644
--- a/momepy/functional/_diversity.py
+++ b/momepy/functional/_diversity.py
@@ -251,7 +251,7 @@ def values_range(
     return stats[max(q)] - stats[min(q)]
 
 
-def theil(y: Series, graph: Graph, q: tuple | list = None):
+def theil(y: Series, graph: Graph, q: tuple | list | None = None):
     """Calculates the Theil measure of inequality of values within neighbours defined in
     ``graph``.
     Uses ``inequality.theil.Theil`` under the hood. Requires '`inequality`' package.
@@ -386,7 +386,7 @@ def shannon(
     graph: Graph,
     binning: str = "HeadTailBreaks",
     categorical: bool = False,
-    categories: list = None,
+    categories: list | None = None,
     **classification_kwds,
 ):
     """Calculates the Shannon index of values within neighbours defined in
@@ -446,7 +446,7 @@ def _apply_shannon(values):
     return graph.apply(y, _apply_shannon)
 
 
-def gini(y: Series, graph: Graph, q: tuple | list = None):
+def gini(y: Series, graph: Graph, q: tuple | list | None = None):
     """Calculates the Gini index of values within neighbours defined in ``graph``.
     Uses ``inequality.gini.Gini`` under the hood. Requires '`inequality`' package.
 

From 91862a7ae2bc81ff9eea5a8cfcbd610d390fc586 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev <krasensam@gmail.com>
Date: Tue, 11 Jun 2024 16:44:27 +0200
Subject: [PATCH 3/4] test versioning

---
 momepy/functional/tests/test_diversity.py | 88 +++++++++++------------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/momepy/functional/tests/test_diversity.py b/momepy/functional/tests/test_diversity.py
index bc9e834c..82066887 100644
--- a/momepy/functional/tests/test_diversity.py
+++ b/momepy/functional/tests/test_diversity.py
@@ -215,18 +215,16 @@ def test_theil(self):
             full_sw2, full_sw_expected, self.df_tessellation, check_names=False
         )
 
-        ## mismatch between percentile interpolation methods
-        # limit = mm.theil(
-        #     self.df_tessellation["area"], self.diversity_graph, q=(10, 90)
-        # )
-        # limit_expected = {
-        #     "count": 144,
-        #     "mean": 0.10575479289690606,
-        #     "min": 0.04633949101071495,
-        #     "max": 0.26582672704556626,
-        # }
-
-        # assert_result(limit, limit_expected, self.df_tessellation, check_names=False)
+        # values differ from legacy mm.Theil due to percentile interpolation methods
+        limit = mm.theil(self.df_tessellation["area"], self.diversity_graph, q=(10, 90))
+        limit_expected = {
+            "count": 144,
+            "mean": 0.09689345872019642,
+            "min": 0.03089398223055910,
+            "max": 0.2726670141461655,
+        }
+
+        assert_result(limit, limit_expected, self.df_tessellation, check_names=False)
 
         zeros = mm.theil(
             pd.Series(np.zeros(len(self.df_tessellation)), self.df_tessellation.index),
@@ -310,17 +308,15 @@ def test_gini(self):
             full_sw, full_sw_expected, self.df_tessellation, check_names=False
         )
 
-        ## mismatch between interpolation methods
-        # limit = mm.gini(
-        #     self.df_tessellation["area"], self.diversity_graph, q=(10, 90)
-        # )
-        # limit_expected = {
-        #     "count": 144,
-        #     "mean": 0.2525181248879755,
-        #     "min": 0.17049602697583713,
-        #     "max": 0.39018140635767645,
-        # }
-        # assert_result(limit, limit_expected, self.df_tessellation, check_names=False)
+        # values differ from legacy mm.Gini due to interpolation methods
+        limit = mm.gini(self.df_tessellation["area"], self.diversity_graph, q=(10, 90))
+        limit_expected = {
+            "count": 144,
+            "mean": 0.2417437064941186,
+            "min": 0.14098983070917345,
+            "max": 0.3978182288393458,
+        }
+        assert_result(limit, limit_expected, self.df_tessellation, check_names=False)
 
     def test_shannon(self):
         with pytest.raises(ValueError):
@@ -740,19 +736,21 @@ def test_theil(self):
             full_sw_new, full_sw_old, check_dtype=False, check_names=False
         )
 
-        # ## old and new have different percentile interpolation methods
-        # limit_new = mm.theil(
-        #     self.df_tessellation["area"], self.graph_diversity, q=(10, 90)
-        # )
-        # limit_old = mm.Theil(
-        #     self.df_tessellation,
-        #     self.df_tessellation.area,
-        #     self.sw,
-        #     "uID",
-        #     rng=(10, 90),
-        # ).series
-        # assert_series_equal(limit_new, limit_old,
-        # check_dtype=False, check_names=False)
+        # old and new have different percentile interpolation methods
+        # therefore the comparison needs a higher rtol
+        limit_new = mm.theil(
+            self.df_tessellation["area"], self.graph_diversity, q=(10, 90)
+        )
+        limit_old = mm.Theil(
+            self.df_tessellation,
+            self.df_tessellation.area,
+            self.sw,
+            "uID",
+            rng=(10, 90),
+        ).series
+        assert_series_equal(
+            limit_new, limit_old, rtol=0.5, check_dtype=False, check_names=False
+        )
 
         zeros_new = mm.theil(
             pd.Series(np.zeros(len(self.df_tessellation)), self.df_tessellation.index),
@@ -816,14 +814,16 @@ def test_gini(self):
         )
 
         # ## old and new have different interpolation methods
-        # limit_new = mm.gini(
-        #     self.df_tessellation["area"], self.graph_diversity, q=(10, 90)
-        # )
-        # limit_old = mm.Gini(
-        #     self.df_tessellation, "area", self.sw, "uID", rng=(10, 90)
-        # ).series
-        # assert_series_equal(limit_new, limit_old,
-        #  check_dtype=False, check_names=False)
+        # therefore the comparison needs a higher rtol
+        limit_new = mm.gini(
+            self.df_tessellation["area"], self.graph_diversity, q=(10, 90)
+        )
+        limit_old = mm.Gini(
+            self.df_tessellation, "area", self.sw, "uID", rng=(10, 90)
+        ).series
+        assert_series_equal(
+            limit_new, limit_old, rtol=0.3, check_dtype=False, check_names=False
+        )
 
     def test_shannon(self):
         ht_sw_new = mm.shannon(self.df_tessellation["area"], self.graph_diversity)

From ef230f46f6779767b6ff2e43cfbb6d7d6a16b1a9 Mon Sep 17 00:00:00 2001
From: Martin Fleischmann <martin@martinfleischmann.net>
Date: Wed, 12 Jun 2024 10:03:11 +0200
Subject: [PATCH 4/4] relative imports, types, notes

---
 momepy/functional/_diversity.py | 46 +++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py
index 55877126..bc144195 100644
--- a/momepy/functional/_diversity.py
+++ b/momepy/functional/_diversity.py
@@ -8,7 +8,7 @@
 from packaging.version import Version
 from pandas import DataFrame, Series
 
-import momepy as mm
+from ..diversity import shannon_diversity, simpson_diversity
 
 try:
     from numba import njit
@@ -171,7 +171,7 @@ def describe(
 
     Notes
     -----
-    The index of ``values`` must match the index along which the ``graph`` is
+    The index of ``y`` must match the index along which the ``graph`` is
     built.
 
     The numba package is used extensively in this function to accelerate the computation
@@ -219,11 +219,16 @@ def describe(
 
 def values_range(
     y: Series | NDArray[np.float64], graph: Graph, q: tuple | list = (0, 100)
-):
+) -> Series:
     """Calculates the range of values within neighbours defined in ``graph``.
 
     Adapted from :cite:`dibble2017`.
 
+    Notes
+    -----
+    The index of ``y`` must match the index along which the ``graph`` is
+    built.
+
     Parameters
     ----------
     data : Series
@@ -247,13 +252,14 @@ def values_range(
     ...                                               q=(25, 75))
     """
 
-    stats = mm.percentile(y, graph, q=q)
+    stats = percentile(y, graph, q=q)
     return stats[max(q)] - stats[min(q)]
 
 
-def theil(y: Series, graph: Graph, q: tuple | list | None = None):
+def theil(y: Series, graph: Graph, q: tuple | list | None = None) -> Series:
     """Calculates the Theil measure of inequality of values within neighbours defined in
     ``graph``.
+
     Uses ``inequality.theil.Theil`` under the hood. Requires '`inequality`' package.
 
     .. math::
@@ -264,6 +270,11 @@ def theil(y: Series, graph: Graph, q: tuple | list | None = None):
             \\right]
         \\right)
 
+    Notes
+    -----
+    The index of ``y`` must match the index along which the ``graph`` is
+    built.
+
     Parameters
     ----------
     y : Series
@@ -309,7 +320,7 @@ def simpson(
     inverse: bool = False,
     categorical: bool = False,
     **classification_kwds,
-):
+) -> Series:
     """Calculates the Simpson's diversity index of values within neighbours defined in
     ``graph``.
     Uses ``mapclassify.classifiers`` under the hood for binning.
@@ -321,6 +332,11 @@ def simpson(
 
     Adapted from :cite:`feliciotti2018`.
 
+    Notes
+    -----
+    The index of ``y`` must match the index along which the ``graph`` is
+    built.
+
     Parameters
     ----------
     y : Series
@@ -366,7 +382,7 @@ def simpson(
         bins = None
 
     def _apply_simpson_diversity(values):
-        return mm.simpson_diversity(
+        return simpson_diversity(
             values,
             bins,
             categorical=categorical,
@@ -388,7 +404,7 @@ def shannon(
     categorical: bool = False,
     categories: list | None = None,
     **classification_kwds,
-):
+) -> Series:
     """Calculates the Shannon index of values within neighbours defined in
     ``graph``.
     Uses ``mapclassify.classifiers`` under the hood
@@ -398,6 +414,11 @@ def shannon(
 
         H^{\\prime}=-\\sum_{i=1}^{R} p_{i} \\ln p_{i}
 
+    Notes
+    -----
+    The index of ``y`` must match the index along which the ``graph`` is
+    built.
+
     Parameters
     ----------
     y : Series
@@ -441,16 +462,19 @@ def shannon(
         bins = categories
 
     def _apply_shannon(values):
-        return mm.shannon_diversity(values, bins, categorical, categories)
+        return shannon_diversity(values, bins, categorical, categories)
 
     return graph.apply(y, _apply_shannon)
 
 
-def gini(y: Series, graph: Graph, q: tuple | list | None = None):
+def gini(y: Series, graph: Graph, q: tuple | list | None = None) -> Series:
     """Calculates the Gini index of values within neighbours defined in ``graph``.
     Uses ``inequality.gini.Gini`` under the hood. Requires '`inequality`' package.
 
-    .. math::
+    Notes
+    -----
+    The index of ``y`` must match the index along which the ``graph`` is
+    built.
 
     Parameters
     ----------