Merge branch 'master' into issue_957

wasade · May 7, 2024 · c9dd170 · c9dd170
2 parents fc8a0bf + e8e6ed6
commit c9dd170
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 14 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -4,8 +4,14 @@ BIOM-Format ChangeLog
 biom 2.1.15-dev
 ---------------
 
-Bug fixes:
+New features:
+
+* NumPy 2.0 support, see issue [#956](https://github.com/biocore/biom-format/issues/956)
+
+Bug Fixes:
 
+* Fixed an edge case in `align_tree` when a feature was empty, see issue [#948](https://github.com/biocore/biom-format/issues/948)
+* In `subsample(..., with_replacement=True)`, it was possible to trigger a numerical instability when summing, see issue [#952](https://github.com/biocore/biom-format/issues/952)
 * `update_ids(..., strict=False)` could yield truncated IDs, see issue [#957](https://github.com/biocore/biom-format/issues/957)
 
 Performance improvements:
@@ -26,8 +32,7 @@ Bug fixes:
 * Allow `Table.to_json` to properly handle numpy types in metadata, see issue [#886](https://github.com/biocore/biom-format/issues/886)
 * Do not modify IDs in place in the presence of duplicate relabels, see issue [#892](https://github.com/biocore/biom-format/issues/892)
 * Catch an edge case where a failed ID update in place would actually change IDs, see issue [#892](https://github.com/biocore/biom-format/issues/892)
-* Fixed an edge case on in `align_tree` when a feature was empty, see issue [#948](https://github.com/biocore/biom-format/issues/948)
-
+
 New features:
 
 * `biom.parse.save_table` makes saving less tedious, see issue [#897](https://github.com/biocore/biom-format/issues/897)

diff --git a/biom/_subsample.pyx b/biom/_subsample.pyx
@@ -44,13 +44,18 @@ cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
         cnp.int32_t start,end,length
         Py_ssize_t i
         cnp.ndarray[cnp.float64_t, ndim=1] pvals
-
+        cnp.ndarray[cnp.float64_t, ndim=1] data_ceil 
+
+    data_ceil = np.ceil(data)
     for i in range(indptr.shape[0] - 1):
         start, end = indptr[i], indptr[i+1]
         length = end - start
-        counts_sum = data[start:end].sum()
-
-        pvals = data[start:end] / counts_sum
+
+        # base p-values on integer data to avoid small numerical issues with 
+        # float on sum
+        counts_sum = data_ceil[start:end].sum()
+        pvals = data_ceil[start:end] / counts_sum
+
         data[start:end] = rng.multinomial(n, pvals)
 
 

diff --git a/biom/table.py b/biom/table.py
@@ -2919,7 +2919,8 @@ def subsample(self, n, axis='sample', by_id=False, with_replacement=False,
         with_replacement : boolean, optional
             If `False` (default), subsample without replacement. If `True`,
             resample with replacement via the multinomial distribution.
-            Should not be `True` if `by_id` is `True`.
+            Should not be `True` if `by_id` is `True`. Important: If `True`,
+            samples with a sum below `n` are retained.
         seed : int, optional
             If provided, set the numpy random seed with this value
 
@@ -2936,14 +2937,16 @@ def subsample(self, n, axis='sample', by_id=False, with_replacement=False,
 
         Notes
         -----
-        Subsampling is performed without replacement. If `n` is greater than
-        the sum of a given vector, that vector is omitted from the result.
-
-        Adapted from `skbio.math.subsample`, see biom-format/licenses for more
-        information about scikit-bio.
+        If subsampling is performed without replacement, vectors with a sum
+        less than `n` are omitted from the result. This condition does not
+        hold when subsampling with replacement.
 
         This code assumes absolute abundance if `by_id` is False.
 
+        If subsampling with replacement, `np.ceil` is applied prior to
+        calculating p-values to ensure that low-abundance features have a
+        chance to be sampled.
+
         Examples
         --------
         >>> import numpy as np
@@ -4868,7 +4871,7 @@ def to_json(self, generated_by, direct_io=None, creation_date=None):
             for col_index, val in enumerate(obs[0]):
                 if float(val) != 0.0:
                     built_row.append(
-                        "[%d,%d,%r]" % (obs_index, col_index, val)
+                        "[%d,%d,%f]" % (obs_index, col_index, val)
                     )
             if built_row:
                 # if we have written a row already, it's safe to add a comma

diff --git a/biom/tests/test_data/edgecase_issue_952.biom b/biom/tests/test_data/edgecase_issue_952.biom
diff --git a/biom/tests/test_table.py b/biom/tests/test_table.py
@@ -3213,6 +3213,23 @@ def f(vals, id_, md):
         with errstate(empty='raise'), self.assertRaises(TableException):
             self.st_rich.filter(f, 'observation')
 
+    def test_subsample_edgecase_issue_952(self):
+        # this file triggers an exception on Linux on subsample
+        # with replacement where the pvals computed sum to > 1. It is a
+        # subset of the data reported in issue 952, specifically constrained
+        # to the first 10 features with any empty samples removed.
+        path = 'test_data/edgecase_issue_952.biom'
+
+        # ...existing logic for test_data, not ideal, but consistent
+        cwd = os.getcwd()
+        if '/' in __file__:
+            os.chdir(__file__.rsplit('/', 1)[0])
+        table = Table.from_hdf5(h5py.File(path, 'r'))
+        os.chdir(cwd)
+
+        obs = table.subsample(10, with_replacement=True)
+        self.assertEqual(set(obs.sum('sample')), {10.0, })
+
     def test_subsample_same_seed_without_replacement(self):
         table = Table(np.array([[3, 1, 2], [0, 3, 4]]), ['O1', 'O2'],
                       ['S1', 'S2', 'S3'])