ENH: Synchronize upstream improvements in choice

bashtage · bashtage · commit 69f1c821287d · 2019-12-20T16:17:00.000Z
Bring in upstream changes in choice
diff --git a/.travis.yml b/.travis.yml
@@ -87,4 +87,4 @@ after_success:
       cd ${BUILD_DIR}
       python benchmark.py;
     fi
-  - if [[ "$COVERAGE" = true ]]; then codecov; coveralls --rcfile="$SRCDIR"/.coveragerc; fi
+  - if [[ "$COVERAGE" = true ]]; then codecov; coveralls --rcfile="$SRCDIR"/.coveragerc || true; fi
diff --git a/doc/source/change-log.rst b/doc/source/change-log.rst
@@ -6,6 +6,8 @@ Change Log
 
 v1.18.0
 =======
+- :meth:`~randomgen.generator.Generator.choice` pulled in upstream performance improvement that
+  use a hash set when choosing without replacement and without user-provided probabilities.
 - Added support for :class:`~randomgen.seed_sequence.SeedSequence` (and NumPy's ``SeedSequence``).
 - Fixed a bug that affected both :class:`~randomgen.generator.Generator.randint`
   in :class:`~randomgen.generator.Generator` and :meth:`~randomgen.mtrand.RandomState.randint`
diff --git a/doc/source/index.rst b/doc/source/index.rst
@@ -9,7 +9,7 @@ generator be be changed.
     There are many changes between v1.16.x and v1.18.x. These reflect API
     decision taken in conjunction with NumPy in preparation of the core
     of ``randomgen`` being used as the preferred random number generator in
-    NumPy. These all issue ``DeprecationWarning`` except for ``BasicRNG.generator``
+    NumPy. These all issue ``DeprecationWarning`` except for ``BitGenerator.generator``
     which raises ``NotImplementedError``. The C-API has also changed to reflect
     the preferred naming the underlying Pseudo-RNGs, which are now known as
     bit generators (or ``BitGenerator``).
diff --git a/randomgen/generator.pyx b/randomgen/generator.pyx
@@ -570,11 +570,11 @@ cdef class Generator:
 
         Parameters
         ----------
-        low : int or array-like of ints
+        low : {int, array_like[int]}
             Lowest (signed) integers to be drawn from the distribution (unless
             ``high=None``, in which case this parameter is one above the
             *highest* such integer).
-        high : int or array-like of ints, optional
+        high : {int, array_like[int]}, optional
             If provided, one above the largest (signed) integer to be drawn
             from the distribution (see above for behavior if ``high=None``).
             If array-like, must contain integer values
@@ -648,10 +648,10 @@ cdef class Generator:
                CoRR, Aug. 13, 2018, http://arxiv.org/abs/1805.10941.
 
         """
-        if use_masked is not None:
+        if use_masked is not None and use_masked:
             import warnings
             warnings.warn("use_masked will be removed in the final release and"
-                          "only the Lemire method will be available.",
+                          " only the Lemire method will be available.",
                           DeprecationWarning)
         if closed is not None:
             import warnings
@@ -725,32 +725,34 @@ cdef class Generator:
         return self.integers(0, 4294967296, size=n_uint32, dtype=np.uint32).tobytes()[:length]
 
     @cython.wraparound(True)
-    def choice(self, a, size=None, replace=True, p=None, axis=0):
+    def choice(self, a, size=None, replace=True, p=None, axis=0, bint shuffle=True):
         """
-        choice(a, size=None, replace=True, p=None, axis=0):
-
-        Generates a random sample from a given 1-D array
-
-                .. versionadded:: 1.7.0
+        choice(a, size=None, replace=True, p=None, axis=0, shuffle=True):
 
         Parameters
         ----------
-        a : 1-D array-like or int
+        a : {array_like, int}
             If an ndarray, a random sample is generated from its elements.
             If an int, the random sample is generated as if a were np.arange(a)
         size : int or tuple of ints, optional
-            Output shape. If the given shape is, e.g., ``(m, n, k)``, then
-            ``m * n * k`` samples are drawn. Default is None, in which case a
-            single value is returned.
+            Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
+            ``m * n * k`` samples are drawn from the 1-d `a`. If `a` has more
+            than one dimension, the `size` shape will be inserted into the
+            `axis` dimension, so the output ``ndim`` will be ``a.ndim - 1 +
+            len(size)``. Default is None, in which case a single value is
+            returned.
         replace : boolean, optional
             Whether the sample is with or without replacement
-        p : 1-D array-like, optional
+        p : 1-D array_like, optional
             The probabilities associated with each entry in a.
             If not given the sample assumes a uniform distribution over all
             entries in a.
         axis : int, optional
             The axis along which the selection is performed. The default, 0,
             selects by row.
+        shuffle : boolean, optional
+            Whether the sample is shuffled when sampling without replacement.
+            Default is True, False provides a speedup.
 
         Returns
         -------
@@ -805,22 +807,20 @@ cdef class Generator:
               dtype='<U11')
 
         """
-        cdef char* idx_ptr
-        cdef int64_t buf
-        cdef char* buf_ptr
 
-        cdef set idx_set
         cdef int64_t val, t, loc, size_i, pop_size_i
         cdef int64_t *idx_data
         cdef np.npy_intp j
+        cdef uint64_t set_size, mask
+        cdef uint64_t[::1] hash_set
         # Format and Verify input
         a = np.array(a, copy=False)
         if a.ndim == 0:
             try:
                 # __index__ must return an integer by python rules.
                 pop_size = operator.index(a.item())
             except TypeError:
-                raise ValueError("`a` must be 1-dimensional or an integer")
+                raise ValueError("`a` must an array or an integer")
             if pop_size <= 0 and np.prod(size) != 0:
                 raise ValueError("`a` must be greater than 0 unless no samples are taken")
         else:
@@ -837,14 +837,17 @@ cdef class Generator:
                     atol = max(atol, np.sqrt(np.finfo(p.dtype).eps))
 
             p = <np.ndarray>np.PyArray_FROM_OTF(p, np.NPY_DOUBLE, api.NPY_ARRAY_ALIGNED | api.NPY_ARRAY_C_CONTIGUOUS)
-            check_array_constraint(p, "p", CONS_BOUNDED_0_1)
             pix = <double*>np.PyArray_DATA(p)
 
             if p.ndim != 1:
                 raise ValueError("`p` must be 1-dimensional")
             if p.size != pop_size:
                 raise ValueError("`a` and `p` must have same size")
             p_sum = kahan_sum(pix, d)
+            if np.isnan(p_sum):
+                raise ValueError("probabilities contain NaN")
+            if np.logical_or.reduce(p < 0):
+                raise ValueError("probabilities are not non-negative")
             if abs(p_sum - 1.) > atol:
                 raise ValueError("probabilities do not sum to 1")
 
@@ -863,7 +866,7 @@ cdef class Generator:
                 idx = cdf.searchsorted(uniform_samples, side="right")
                 idx = np.array(idx, copy=False, dtype=np.int64)  # searchsorted returns a scalar
             else:
-                idx = self.integers(0, pop_size, size=shape, dtype=np.int64)
+                idx = self.integers(0, pop_size, size=shape, dtype=np.int64, use_masked=False)
         else:
             if size > pop_size:
                 raise ValueError("Cannot take a larger sample than "
@@ -879,7 +882,7 @@ cdef class Generator:
                 found = np.zeros(shape, dtype=np.int64)
                 flat_found = found.ravel()
                 while n_uniq < size:
-                    x = self.random(size - n_uniq)
+                    x = self.random((size - n_uniq,))
                     if n_uniq > 0:
                         p[flat_found[0:n_uniq]] = 0
                     cdf = np.cumsum(p)
@@ -895,36 +898,46 @@ cdef class Generator:
                 size_i = size
                 pop_size_i = pop_size
                 # This is a heuristic tuning. should be improvable
-                if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size)):
+                if shuffle:
+                    cutoff = 50
+                else:
+                    cutoff = 20
+
+                if pop_size_i > 10000 and (size_i > (pop_size_i // cutoff)):
                     # Tail shuffle size elements
-                    idx = np.arange(pop_size, dtype=np.int64)
-                    idx_ptr = np.PyArray_BYTES(<np.ndarray>idx)
-                    buf_ptr = <char*>&buf
-                    self._shuffle_raw(pop_size_i, max(pop_size_i - size_i, 1),
-                                      8, 8, idx_ptr, buf_ptr)
+                    idx = np.PyArray_Arange(0, pop_size_i, 1, np.NPY_INT64)
+                    idx_data = <int64_t*>np.PyArray_DATA(<np.ndarray>idx)
+                    with self.lock, nogil:
+                        self._shuffle_int(pop_size_i, max(pop_size_i - size_i, 1),
+                                          idx_data)
                     # Copy to allow potentially large array backing idx to be gc
                     idx = idx[(pop_size - size):].copy()
                 else:
-                    # Floyds's algorithm with precomputed indices
-                    # Worst case, O(n**2) when size is close to pop_size
+                    # Floyd's algorithm
                     idx = np.empty(size, dtype=np.int64)
                     idx_data = <int64_t*>np.PyArray_DATA(<np.ndarray>idx)
-                    idx_set = set()
-                    loc = 0
-                    # Sample indices with one pass to avoid reacquiring the lock
-                    with self.lock:
-                        for j in range(pop_size_i - size_i, pop_size_i):
-                            idx_data[loc] = random_interval(&self._bitgen, j)
-                            loc += 1
-                    loc = 0
-                    while len(idx_set) < size_i:
+                    # smallest power of 2 larger than 1.2 * size
+                    set_size = <uint64_t>(1.2 * size_i)
+                    mask = _gen_mask(set_size)
+                    set_size = 1 + mask
+                    hash_set = np.full(set_size, <uint64_t>-1, np.uint64)
+                    with self.lock, cython.wraparound(False), nogil:
                         for j in range(pop_size_i - size_i, pop_size_i):
-                            if idx_data[loc] not in idx_set:
-                                val = idx_data[loc]
-                            else:
-                                idx_data[loc] = val = j
-                            idx_set.add(val)
-                            loc += 1
+                            val = random_bounded_uint64(&self._bitgen, 0, j, 0, 0)
+                            loc = val & mask
+                            while hash_set[loc] != <uint64_t>-1 and hash_set[loc] != <uint64_t>val:
+                                loc = (loc + 1) & mask
+                            if hash_set[loc] == <uint64_t>-1: # then val not in hash_set
+                                hash_set[loc] = val
+                                idx_data[j - pop_size_i + size_i] = val
+                            else: # we need to insert j instead
+                                loc = j & mask
+                                while hash_set[loc] != <uint64_t>-1:
+                                    loc = (loc + 1) & mask
+                                hash_set[loc] = j
+                                idx_data[j - pop_size_i + size_i] = j
+                        if shuffle:
+                            self._shuffle_int(size_i, 1, idx_data)
                 if shape is not None:
                     idx.shape = shape
 
@@ -3908,7 +3921,7 @@ cdef class Generator:
 
         Parameters
         ----------
-        n : int or array-like of ints
+        n : {int, array_like[int]}
             Number of experiments.
         pvals : sequence of floats, length p
             Probabilities of each of the ``p`` different outcomes. These
@@ -4286,6 +4299,28 @@ cdef class Generator:
             string.memcpy(data + j * stride, data + i * stride, itemsize)
             string.memcpy(data + i * stride, buf, itemsize)
 
+    cdef inline void _shuffle_int(self, np.npy_intp n, np.npy_intp first,
+                                  int64_t* data) nogil:
+        """
+        Parameters
+        ----------
+        n
+            Number of elements in data
+        first
+            First observation to shuffle.  Shuffles n-1,
+            n-2, ..., first, so that when first=1 the entire
+            array is shuffled
+        data
+            Location of data
+        """
+        cdef np.npy_intp i, j
+        cdef int64_t temp
+        for i in reversed(range(first, n)):
+            j = random_bounded_uint64(&self._bitgen, 0, i, 0, 0)
+            temp = data[j]
+            data[j] = data[i]
+            data[i] = temp
+
     def permutation(self, object x):
         """
         permutation(x)
diff --git a/randomgen/tests/test_against_numpy.py b/randomgen/tests/test_against_numpy.py
@@ -189,7 +189,7 @@ def test_standard_exponential(self):
                         self.rs.standard_exponential)
         self._is_state_common_legacy()
 
-    @pytest.mark.xfail(reason="Stream broken for simplicity")
+    @pytest.mark.xfail(reason="Stream broken for simplicity", strict=True)
     def test_tomaxint(self):
         self._set_common_state()
         self._is_state_common()
@@ -298,7 +298,7 @@ def test_rand(self):
         with pytest.deprecated_call():
             assert_allclose(f(3, 4, 5), g(3, 4, 5))
 
-    @pytest.mark.xfail(reason="Definition of poisson_lam_max changed")
+    @pytest.mark.xfail(reason="poisson_lam_max changed", strict=True)
     def test_poisson_lam_max(self):
         assert_allclose(self.rg.poisson_lam_max, self.nprs.poisson_lam_max)
 
@@ -309,7 +309,7 @@ def test_triangular(self):
                         self.rg.triangular)
         self._is_state_common()
 
-    @pytest.mark.xfail(reason="Changes to hypergeometic break stream")
+    @pytest.mark.xfail(reason="Changes to hypergeometic", strict=True)
     def test_hypergeometric(self):
         self._set_common_state()
         self._is_state_common()
@@ -339,7 +339,7 @@ def test_multinomial(self):
                      g(100, np.array(p), size=(7, 23)))
         self._is_state_common()
 
-    @pytest.mark.xfail(reason="Stream broken for performance")
+    @pytest.mark.xfail(reason="Stream broken for performance", strict=True)
     def test_choice(self):
         self._set_common_state()
         self._is_state_common()
diff --git a/randomgen/tests/test_final_release_changes.py b/randomgen/tests/test_final_release_changes.py
@@ -58,9 +58,9 @@ def test_integers_closed():
         random_gen.integers(0, 10, closed=False)
 
 
-def test_integers_use_masked(endpoint):
+def test_integers_use_masked():
     with pytest.deprecated_call():
-        random_gen.integers(0, 10, use_masked=endpoint)
+        random_gen.integers(0, 10, use_masked=True)
 
 
 def test_integers_large_negative_value():
diff --git a/randomgen/tests/test_generator_117.py b/randomgen/tests/test_generator_117.py
@@ -1,3 +1,4 @@
+from distutils.version import LooseVersion
 from itertools import product
 
 import numpy as np
@@ -19,6 +20,15 @@
     from randomgen import PCG64
 
 
+v1174 = LooseVersion("1.17.4")
+v118 = LooseVersion("1.18")
+NP_LT_1174 = LooseVersion(np.__version__) < v1174
+NP_LT_1174_OR_GT_118 = NP_LT_1174 or LooseVersion(np.__version__) > v118
+
+pytestmark = pytest.mark.skipif(NP_LT_1174_OR_GT_118,
+                                reason="Only test 1.17.4 to 1.18.x")
+
+
 def positive_param():
     base = Generator(PCG64())
     return [base.chisquare(10),
@@ -221,35 +231,37 @@ def test_permutation():
     assert_array_equal(result, expected)
 
 
-@pytest.mark.xfail(reason="Choice has not been updated")
 @pytest.mark.parametrize("replace", [True, False])
-def test_choice(replace):
-    np_gen.bit_generator.state = initial_state
+def test_choice_with_p(replace):
     x = np.arange(100)
-    expected = np_gen.choice(x, size=10, replace=replace)
+    np_gen.bit_generator.state = initial_state
+    p = (x + 1) / (x + 1).sum()
+    expected = np_gen.choice(x, size=10, replace=replace, p=p)
 
     gen.bit_generator.state = initial_state
-    result = gen.choice(x, size=10, replace=replace)
+    result = gen.choice(x, size=10, replace=replace, p=p)
     assert_array_equal(result, expected)
 
+
+@pytest.mark.parametrize("replace", [True, False])
+def test_choice(replace):
     np_gen.bit_generator.state = initial_state
-    p = (x + 1) / (x + 1).sum()
-    expected = np_gen.choice(x, size=10, replace=replace, p=p)
+    x = np.arange(100)
+    expected = np_gen.choice(x, size=10, replace=replace)
 
     gen.bit_generator.state = initial_state
-    result = gen.choice(x, size=10, replace=replace, p=p)
+    result = gen.choice(x, size=10, replace=replace)
     assert_array_equal(result, expected)
 
 
-@pytest.mark.xfail(reason="Changes to lemire generators")
+@pytest.mark.skipif(NP_LT_1174, reason="Changes to lemire generators")
 @pytest.mark.parametrize("args", integers())
 def test_integers(args):
     np_gen.bit_generator.state = initial_state
     expected = np_gen.integers(*args)
 
     gen.bit_generator.state = initial_state
-    with pytest.deprecated_call():
-        result = gen.integers(*args, use_masked=False)
+    result = gen.integers(*args, use_masked=False)
     assert_array_equal(result, expected)
 
 
diff --git a/randomgen/tests/test_generator_mt19937.py b/randomgen/tests/test_generator_mt19937.py