Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Functional diversity #581

Merged
merged 8 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
316 changes: 315 additions & 1 deletion momepy/functional/_diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
import numpy as np
import pandas as pd
from libpysal.graph import Graph
from libpysal.graph._utils import _percentile_filtration_grouper
from numpy.typing import NDArray
from packaging.version import Version
from pandas import DataFrame, Series

from ..diversity import shannon_diversity, simpson_diversity

try:
from numba import njit

Expand All @@ -18,10 +21,21 @@
# Public API of this module: the names exported by a star import.
__all__ = [
    "describe",
    "describe_reached",
    "values_range",
    "theil",
    "simpson",
    "shannon",
    "gini",
    "percentile",
]


def _get_grouper(y, graph):
    """Group the values of ``y`` according to the adjacency index of ``graph``.

    The integer codes of the two levels of ``graph._adjacency.index`` are used
    directly: values of ``y`` are taken positionally at the codes of the second
    level and grouped by the codes of the first level (presumably the
    neighbour and focal levels respectively -- confirm against libpysal).

    Parameters
    ----------
    y : Series
        Values to group. Accessed positionally, so its order must match the
        order along which ``graph`` is built.
    graph : libpysal.graph.Graph
        Graph whose ``_adjacency`` index supplies the two code arrays.

    Returns
    -------
    pandas GroupBy
        Grouped values keyed by the first-level codes of the adjacency index.
    """
    level_codes = graph._adjacency.index.codes
    taken = y.take(level_codes[1])
    return taken.groupby(level_codes[0])


@njit
def _interpolate(values, q):
weights = values[:, 0]
Expand Down Expand Up @@ -157,7 +171,7 @@ def describe(

Notes
-----
The index of ``values`` must match the index along which the ``graph`` is
The index of ``y`` must match the index along which the ``graph`` is
built.

The numba package is used extensively in this function to accelerate the computation
Expand Down Expand Up @@ -203,6 +217,306 @@ def describe(
return _compute_stats(grouper, q, include_mode)


def values_range(
    y: Series | NDArray[np.float64], graph: Graph, q: tuple | list = (0, 100)
) -> Series:
    """Calculate the range of values within neighbours defined in ``graph``.

    The range is the difference between the larger and the smaller of the two
    percentiles in ``q``, both obtained from :func:`percentile`.

    Adapted from :cite:`dibble2017`.

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Parameters
    ----------
    y : Series or numpy array
        The values to be analysed.
    graph : libpysal.graph.Graph
        A spatial weights matrix for the data.
    q : tuple, list, optional (default (0, 100))
        A two-element sequence containing floats between 0 and 100 (inclusive)
        that are the percentiles over which to compute the range.
        The order of the elements is not important.

    Returns
    -------
    Series
        A Series containing resulting values.

    Examples
    --------
    >>> tessellation_df['area_IQR_3steps'] = mm.values_range(
    ...     tessellation_df['area'], graph, q=(25, 75)
    ... )
    """
    per_percentile = percentile(y, graph, q=q)
    # ``q`` may be given in any order, so pick the bounds explicitly.
    upper_bound = per_percentile[max(q)]
    lower_bound = per_percentile[min(q)]
    return upper_bound - lower_bound


def theil(y: Series, graph: Graph, q: tuple | list | None = None) -> Series:
    """Calculate the Theil measure of inequality of values within neighbours
    defined in ``graph``.

    Uses ``inequality.theil.Theil`` under the hood. Requires the ``inequality``
    package.

    .. math::

        T = \\sum_{i=1}^n \\left(
            \\frac{y_i}{\\sum_{i=1}^n y_i} \\ln \\left[
                N \\frac{y_i} {\\sum_{i=1}^n y_i}
            \\right]
        \\right)

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Parameters
    ----------
    y : Series
        A Series containing the values to be analysed.
    graph : libpysal.graph.Graph
        A spatial weights matrix for the data.
    q : tuple, list, optional (default None)
        A two-element sequence of percentiles between 0 and 100 (inclusive).
        When given, the neighbourhood values are grouped with libpysal's
        ``_percentile_filtration_grouper`` (presumably keeping only values
        between the two percentiles -- confirm against libpysal); when
        ``None`` or empty, all neighbourhood values are used.

    Returns
    -------
    Series
        A Series containing resulting values, indexed by ``graph.unique_ids``.

    Raises
    ------
    ImportError
        If the ``inequality`` package is not installed.

    Examples
    --------
    >>> tessellation_df['area_Theil'] = mm.theil(tessellation_df['area'],
    ...                                          graph)
    """
    try:
        from inequality.theil import Theil
    except ImportError as err:
        raise ImportError("The 'inequality' package is required.") from err

    def _theil_of_group(group):
        # ``T`` is the attribute of ``Theil`` holding the computed index.
        return Theil(group.values).T

    grouper = (
        _percentile_filtration_grouper(y, graph._adjacency.index, q=q)
        if q
        else _get_grouper(y, graph)
    )

    theil_values = grouper.apply(_theil_of_group)
    # The grouper is keyed by integer codes; restore the graph's own ids.
    theil_values.index = graph.unique_ids
    return theil_values


def simpson(
    y: Series,
    graph: Graph,
    binning: str = "HeadTailBreaks",
    gini_simpson: bool = False,
    inverse: bool = False,
    categorical: bool = False,
    **classification_kwds,
) -> Series:
    """Calculate the Simpson's diversity index of values within neighbours
    defined in ``graph``.

    Uses ``mapclassify`` under the hood for binning (unless ``categorical`` is
    True). The bins are derived once from the whole of ``y`` and the same bins
    are then passed to every ``simpson_diversity`` call made through
    ``graph.apply``.

    .. math::

        \\lambda=\\sum_{i=1}^{R} p_{i}^{2}

    Adapted from :cite:`feliciotti2018`.

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Parameters
    ----------
    y : Series
        A Series containing the values to be analysed.
    graph : libpysal.graph.Graph
        A spatial weights matrix for the data.
    binning : str (default 'HeadTailBreaks')
        One of mapclassify classification schemes. For details see
        `mapclassify API documentation <http://pysal.org/mapclassify/api.html>`_.
    gini_simpson : bool (default False)
        Return Gini-Simpson index instead of Simpson index (``1 - λ``).
        Takes precedence over ``inverse`` when both are True.
    inverse : bool (default False)
        Return Inverse Simpson index instead of Simpson index (``1 / λ``).
    categorical : bool (default False)
        Treat values as categories (will not use ``binning``).
    **classification_kwds : dict
        Keyword arguments for the classification scheme.
        For details see `mapclassify documentation <https://pysal.org/mapclassify>`_.

    Returns
    -------
    Series
        A Series containing resulting values.

    Raises
    ------
    ImportError
        If ``categorical`` is False and ``mapclassify`` cannot be imported.

    Examples
    --------
    >>> tessellation_df['area_Simpson'] = mm.simpson(tessellation_df['area'],
    ...                                              graph)

    See also
    --------
    momepy.simpson_diversity : Calculates the Simpson's diversity index of data.
    """
    if categorical:
        # Categories are counted as they are; no classification is needed.
        bins = None
    else:
        try:
            from mapclassify import classify
        except ImportError as err:
            raise ImportError(
                "The 'mapclassify >= 2.4.2` package is required."
            ) from err
        bins = classify(y, scheme=binning, **classification_kwds).bins

    result = graph.apply(
        y,
        lambda values: simpson_diversity(values, bins, categorical=categorical),
    )

    if gini_simpson:
        return 1 - result
    if inverse:
        return 1 / result
    return result


def shannon(
    y: Series,
    graph: Graph,
    binning: str = "HeadTailBreaks",
    categorical: bool = False,
    categories: list | None = None,
    **classification_kwds,
) -> Series:
    """Calculate the Shannon index of values within neighbours defined in
    ``graph``.

    Uses ``mapclassify`` under the hood for binning (unless ``categorical`` is
    True). The bins are derived once from the whole of ``y`` and the same bins
    are then passed to every ``shannon_diversity`` call made through
    ``graph.apply``.

    .. math::

        H^{\\prime}=-\\sum_{i=1}^{R} p_{i} \\ln p_{i}

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Parameters
    ----------
    y : Series
        A Series containing the values to be analysed.
    graph : libpysal.graph.Graph
        A spatial weights matrix for the data.
    binning : str (default 'HeadTailBreaks')
        One of mapclassify classification schemes. For details see
        `mapclassify API documentation <http://pysal.org/mapclassify/api.html>`_.
    categorical : bool (default False)
        Treat values as categories (will not use binning).
    categories : list-like (default None)
        A list of categories. If ``None`` or empty, ``y.unique()`` is used.
        Any sized list-like (list, tuple, NumPy array, pandas Index/Series)
        is accepted.
    **classification_kwds : dict
        Keyword arguments for classification scheme
        For details see `mapclassify documentation <https://pysal.org/mapclassify>`_.

    Returns
    -------
    Series
        A Series containing resulting values.

    Raises
    ------
    ImportError
        If ``categorical`` is False and ``mapclassify`` cannot be imported.

    Examples
    --------
    >>> tessellation_df['area_Shannon'] = mm.shannon(tessellation_df['area'],
    ...                                              graph)
    """
    # ``categories`` is list-like, so it may be a NumPy array or a pandas
    # object. ``not categories`` is ambiguous for those (raises for more than
    # one element, or silently drops a single falsy element), hence the
    # explicit None / emptiness test. Empty input still falls back to
    # ``y.unique()`` exactly as before.
    if categories is None or len(categories) == 0:
        categories = y.unique()

    if not categorical:
        try:
            from mapclassify import classify
        except ImportError as err:
            raise ImportError(
                "The 'mapclassify >= 2.4.2` package is required."
            ) from err
        bins = classify(y, scheme=binning, **classification_kwds).bins
    else:
        # In categorical mode the categories themselves play the role of bins.
        bins = categories

    def _apply_shannon(values):
        return shannon_diversity(values, bins, categorical, categories)

    return graph.apply(y, _apply_shannon)


def gini(y: Series, graph: Graph, q: tuple | list | None = None) -> Series:
    """Calculate the Gini index of values within neighbours defined in ``graph``.

    Uses ``inequality.gini.Gini`` under the hood. Requires the ``inequality``
    package.

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.

    Parameters
    ----------
    y : Series
        A Series containing the values to be analysed. Must not contain
        negative numbers.
    graph : libpysal.graph.Graph
        A spatial weights matrix for the data.
    q : tuple, list, optional (default None)
        A two-element sequence of percentiles between 0 and 100 (inclusive).
        When given, the neighbourhood values are grouped with libpysal's
        ``_percentile_filtration_grouper`` (presumably keeping only values
        between the two percentiles -- confirm against libpysal); when
        ``None`` or empty, all neighbourhood values are used.

    Returns
    -------
    Series
        A Series containing resulting values, indexed by ``graph.unique_ids``.

    Raises
    ------
    ImportError
        If the ``inequality`` package is not installed.
    ValueError
        If ``y`` contains negative numbers.

    Examples
    --------
    >>> tessellation_df['area_Gini'] = mm.gini(tessellation_df['area'],
    ...                                        graph)
    """
    try:
        from inequality.gini import Gini
    except ImportError as err:
        raise ImportError("The 'inequality' package is required.") from err

    if y.min() < 0:
        # Adjacent string literals are concatenated verbatim, so the trailing
        # space after "before" is required to keep the words apart.
        raise ValueError(
            "Values contain negative numbers. Normalise data before "
            "using momepy.gini."
        )
    if q:
        grouper = _percentile_filtration_grouper(y, graph._adjacency.index, q=q)
    else:
        grouper = _get_grouper(y, graph)

    # ``g`` is the attribute of ``Gini`` holding the computed coefficient.
    result = grouper.apply(lambda x: Gini(x.values).g)
    # The grouper is keyed by integer codes; restore the graph's own ids.
    result.index = graph.unique_ids
    return result


def describe_reached(
y: np.ndarray | Series,
graph_index: np.ndarray | Series,
Expand Down
Loading
Loading