Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New keyword parameter to cf.histogram: density #795

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ version NEXTVERSION

**2024-??-??**

* New keyword parameter to `cf.histogram`: ``density``
(https://github.com/NCAS-CMS/cf-python/issues/794)
* New method: `cf.Field.is_discrete_axis`
(https://github.com/NCAS-CMS/cf-python/issues/784)
* Include the UM version as a field property when reading UM files
Expand Down
79 changes: 69 additions & 10 deletions cf/maths.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def relative_vorticity(
) # pragman: no cover


def histogram(*digitized):
def histogram(*digitized, density=False):
"""Return the distribution of a set of variables in the form of an
N-dimensional histogram.

Expand Down Expand Up @@ -151,6 +151,17 @@ def histogram(*digitized):
manipulating the dimensions of the digitized field
construct's data to ensure that broadcasting can occur.

density: `bool`, optional
If False, the default, the result will contain the number
of samples in each bin. If True, the result is the value
of the probability density function at the bin, normalized
such that the integral over the range is 1. Note that the
sum of the histogram values will not be equal to 1 unless
bins of unity size are chosen; it is not a probability
mass function.

.. versionadded:: NEXTVERSION

:Returns:

`Field`
Expand Down Expand Up @@ -197,7 +208,7 @@ def histogram(*digitized):
[0.1174 0.1317]
[0.1317 0.146 ]]
>>> h = cf.histogram(indices)
>>> rint(h)
>>> print(h)
Field: number_of_observations
-----------------------------
Data : number_of_observations(specific_humidity(10)) 1
Expand All @@ -216,7 +227,21 @@ def histogram(*digitized):
[0.1031 0.1174]
[0.1174 0.1317]
[0.1317 0.146 ]]
>>> p = cf.histogram(indices, density=True)
>>> print(p)
Field: long_name=probability density function
---------------------------------------------
Data : long_name=probability density function(specific_humidity(10)) 1
Cell methods : latitude: longitude: point
Dimension coords: specific_humidity(10) = [0.01015, ..., 0.13885] 1
>>> print(p.data.round(2).array))
[15.73 12.24 15.73 6.99 8.74 1.75 1.75 1.75 3.5 1.75]

The sum of the density values multiplied by the bin sizes is
unity:

>>> print((p * p.dimension_coordinate().cellsize).sum().array)
1.0

Create a two-dimensional histogram based on specific humidity and
temperature bins. The temperature bins in this example are derived
Expand All @@ -225,8 +250,8 @@ def histogram(*digitized):

>>> g = f.copy()
>>> g.standard_name = 'air_temperature'
>>> import numpy
>>> g[...] = numpy.random.normal(loc=290, scale=10, size=40).reshape(5, 8)
>>> import numpy as np
>>> g[...] = np.arange(40).reshape(5, 8) + 253.15
>>> g.override_units('K', inplace=True)
>>> print(g)
Field: air_temperature (ncvar%q)
Expand All @@ -247,13 +272,28 @@ def histogram(*digitized):
Dimension coords: air_temperature(5) = [281.1054839143287, ..., 313.9741786365939] K
: specific_humidity(10) = [0.01015, ..., 0.13885] 1
>>> print(h.array)
[[2 1 5 3 2 -- -- -- -- --]
[1 1 2 -- 1 -- 1 1 -- --]
[4 4 2 1 1 1 -- -- 1 1]
[1 1 -- -- 1 -- -- -- 1 --]
[1 -- -- -- -- -- -- -- -- --]]
[[3 3 2 -- -- -- -- -- -- --]
[1 1 2 1 3 -- -- -- -- --]
[1 -- -- 1 -- 1 1 1 2 1]
[2 1 1 2 2 -- -- -- -- --]
[2 2 4 -- -- -- -- -- -- --]]
>>> h.sum()
<CF Data(): 40 1>
>>> p = cf.histogram(indices, indices_t, density=True)
>>> print(p)
Field: long_name=probability density function
---------------------------------------------
Data : long_name=probability density function(air_temperature(5), specific_humidity(10)) K-1
Cell methods : latitude: longitude: point
Dimension coords: air_temperature(5) = [257.05, ..., 288.25] K
: specific_humidity(10) = [0.01015, ..., 0.13885] 1
>>> print(p.array)
>>> print(p.data.round(2).array))
[[0.67 0.67 0.45 -- -- -- -- -- -- --]
[0.22 0.22 0.45 0.22 0.67 -- -- -- -- --]
[0.22 -- -- 0.22 -- 0.22 0.22 0.22 0.45 0.22]
[0.45 0.22 0.22 0.45 0.45 -- -- -- -- --]
[0.45 0.45 0.9 -- -- -- -- -- -- --]]

"""
if not digitized:
Expand All @@ -264,7 +304,26 @@ def histogram(*digitized):
f = digitized[0].copy()
f.clear_properties()

return f.bin("sample_size", digitized=digitized)
out = f.bin("sample_size", digitized=digitized)

if density:
# Get the measure of each bin. This is the outer product of
# the cell sizes of each dimension coordinate construct (the
# dimension coordinate constructs define the bins).
data_axes = out.get_data_axes()
dim = out.dimension_coordinate(filter_by_axis=(data_axes[0],))
bin_measures = dim.cellsize
for axis in data_axes[1:]:
dim = out.dimension_coordinate(filter_by_axis=(axis,))
bin_measures.outerproduct(dim.cellsize, inplace=True)

# Convert counts to densities
out /= out.data.sum() * bin_measures

out.del_property("standard_name", None)
out.set_property("long_name", "probability density function")

return out


def curl_xy(fx, fy, x_wrap=None, one_sided_at_boundary=False, radius=None):
Expand Down
50 changes: 50 additions & 0 deletions cf/test/test_Maths.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

faulthandler.enable() # to debug seg faults and timeouts

import numpy as np

import cf


Expand Down Expand Up @@ -207,6 +209,54 @@ def test_differential_operators(self):
zeros[...] = 0
self.assertTrue(cg.data.equals(zeros.data, rtol=0, atol=1e-15))

def test_histogram(self):
f = cf.example_field(0)
g = f.copy()
g.standard_name = "air_temperature"
g[...] = np.arange(40).reshape(5, 8) + 253.15
g.override_units("K", inplace=True)

atol = 1e-15

# 1-d histogram
indices = f.digitize(10)
h = cf.histogram(indices)
self.assertTrue((h.array == [9, 7, 9, 4, 5, 1, 1, 1, 2, 1]).all)
h = cf.histogram(indices, density=True)
self.assertEqual(h.Units, cf.Units("1"))
# Check that integral is 1
bin_measures = h.dimension_coordinate().cellsize
integral = (h * bin_measures).sum()
self.assertTrue(np.allclose(integral.array, 1, rtol=0, atol=atol))

# 2-d histogram
indices_t = g.digitize(5)
h = cf.histogram(indices, indices_t)
self.assertEqual(h.Units, cf.Units())
# The -1 values correspond to missing values in h
self.assertTrue(
(
h.array
== [
[3, 3, 2, -1, -1, -1, -1, -1, -1, -1],
[1, 1, 2, 1, 3, -1, -1, -1, -1, -1],
[1, -1, -1, 1, -1, 1, 1, 1, 2, 1],
[2, 1, 1, 2, 2, -1, -1, -1, -1, -1],
[2, 2, 4, -1, -1, -1, -1, -1, -1, -1],
]
).all()
)

h = cf.histogram(indices, indices_t, density=True)
self.assertEqual(h.Units, cf.Units("K-1"))
# Check that integral is 1
bin_measures = h.dimension_coordinate("air_temperature").cellsize
bin_measures.outerproduct(
h.dimension_coordinate("specific_humidity").cellsize, inplace=True
)
integral = (h * bin_measures).sum()
self.assertTrue(np.allclose(integral.array, 1, rtol=0, atol=atol))


if __name__ == "__main__":
print("Run date:", datetime.datetime.now())
Expand Down