Skip to content

Commit 69f1c82

Browse files
committed
ENH: Synchronize upstream improvements in choice
Bring in upstream changes in choice
1 parent 8a07c1d commit 69f1c82

8 files changed

+125
-78
lines changed

.travis.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -87,4 +87,4 @@ after_success:
8787
cd ${BUILD_DIR}
8888
python benchmark.py;
8989
fi
90-
- if [[ "$COVERAGE" = true ]]; then codecov; coveralls --rcfile="$SRCDIR"/.coveragerc; fi
90+
- if [[ "$COVERAGE" = true ]]; then codecov; coveralls --rcfile="$SRCDIR"/.coveragerc || true; fi

doc/source/change-log.rst

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ Change Log
66

77
v1.18.0
88
=======
9+
- :meth:`~randomgen.generator.Generator.choice` pulled in upstream performance improvement that
10+
use a hash set when choosing without replacement and without user-provided probabilities.
911
- Added support for :class:`~randomgen.seed_sequence.SeedSequence` (and NumPy's ``SeedSequence``).
1012
- Fixed a bug that affected both :class:`~randomgen.generator.Generator.randint`
1113
in :class:`~randomgen.generator.Generator` and :meth:`~randomgen.mtrand.RandomState.randint`

doc/source/index.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ generator be be changed.
99
There are many changes between v1.16.x and v1.18.x. These reflect API
1010
decision taken in conjunction with NumPy in preparation of the core
1111
of ``randomgen`` being used as the preferred random number generator in
12-
NumPy. These all issue ``DeprecationWarning`` except for ``BasicRNG.generator``
12+
NumPy. These all issue ``DeprecationWarning`` except for ``BitGenerator.generator``
1313
which raises ``NotImplementedError``. The C-API has also changed to reflect
1414
the preferred naming the underlying Pseudo-RNGs, which are now known as
1515
bit generators (or ``BitGenerator``).

randomgen/generator.pyx

+82-47
Original file line numberDiff line numberDiff line change
@@ -570,11 +570,11 @@ cdef class Generator:
570570
571571
Parameters
572572
----------
573-
low : int or array-like of ints
573+
low : {int, array_like[int]}
574574
Lowest (signed) integers to be drawn from the distribution (unless
575575
``high=None``, in which case this parameter is one above the
576576
*highest* such integer).
577-
high : int or array-like of ints, optional
577+
high : {int, array_like[int]}, optional
578578
If provided, one above the largest (signed) integer to be drawn
579579
from the distribution (see above for behavior if ``high=None``).
580580
If array-like, must contain integer values
@@ -648,10 +648,10 @@ cdef class Generator:
648648
CoRR, Aug. 13, 2018, http://arxiv.org/abs/1805.10941.
649649
650650
"""
651-
if use_masked is not None:
651+
if use_masked is not None and use_masked:
652652
import warnings
653653
warnings.warn("use_masked will be removed in the final release and"
654-
"only the Lemire method will be available.",
654+
" only the Lemire method will be available.",
655655
DeprecationWarning)
656656
if closed is not None:
657657
import warnings
@@ -725,32 +725,34 @@ cdef class Generator:
725725
return self.integers(0, 4294967296, size=n_uint32, dtype=np.uint32).tobytes()[:length]
726726

727727
@cython.wraparound(True)
728-
def choice(self, a, size=None, replace=True, p=None, axis=0):
728+
def choice(self, a, size=None, replace=True, p=None, axis=0, bint shuffle=True):
729729
"""
730-
choice(a, size=None, replace=True, p=None, axis=0):
731-
732-
Generates a random sample from a given 1-D array
733-
734-
.. versionadded:: 1.7.0
730+
choice(a, size=None, replace=True, p=None, axis=0, shuffle=True):
735731
736732
Parameters
737733
----------
738-
a : 1-D array-like or int
734+
a : {array_like, int}
739735
If an ndarray, a random sample is generated from its elements.
740736
If an int, the random sample is generated as if a were np.arange(a)
741737
size : int or tuple of ints, optional
742-
Output shape. If the given shape is, e.g., ``(m, n, k)``, then
743-
``m * n * k`` samples are drawn. Default is None, in which case a
744-
single value is returned.
738+
Output shape. If the given shape is, e.g., ``(m, n, k)``, then
739+
``m * n * k`` samples are drawn from the 1-d `a`. If `a` has more
740+
than one dimension, the `size` shape will be inserted into the
741+
`axis` dimension, so the output ``ndim`` will be ``a.ndim - 1 +
742+
len(size)``. Default is None, in which case a single value is
743+
returned.
745744
replace : boolean, optional
746745
Whether the sample is with or without replacement
747-
p : 1-D array-like, optional
746+
p : 1-D array_like, optional
748747
The probabilities associated with each entry in a.
749748
If not given the sample assumes a uniform distribution over all
750749
entries in a.
751750
axis : int, optional
752751
The axis along which the selection is performed. The default, 0,
753752
selects by row.
753+
shuffle : boolean, optional
754+
Whether the sample is shuffled when sampling without replacement.
755+
Default is True, False provides a speedup.
754756
755757
Returns
756758
-------
@@ -805,22 +807,20 @@ cdef class Generator:
805807
dtype='<U11')
806808
807809
"""
808-
cdef char* idx_ptr
809-
cdef int64_t buf
810-
cdef char* buf_ptr
811810

812-
cdef set idx_set
813811
cdef int64_t val, t, loc, size_i, pop_size_i
814812
cdef int64_t *idx_data
815813
cdef np.npy_intp j
814+
cdef uint64_t set_size, mask
815+
cdef uint64_t[::1] hash_set
816816
# Format and Verify input
817817
a = np.array(a, copy=False)
818818
if a.ndim == 0:
819819
try:
820820
# __index__ must return an integer by python rules.
821821
pop_size = operator.index(a.item())
822822
except TypeError:
823-
raise ValueError("`a` must be 1-dimensional or an integer")
823+
raise ValueError("`a` must an array or an integer")
824824
if pop_size <= 0 and np.prod(size) != 0:
825825
raise ValueError("`a` must be greater than 0 unless no samples are taken")
826826
else:
@@ -837,14 +837,17 @@ cdef class Generator:
837837
atol = max(atol, np.sqrt(np.finfo(p.dtype).eps))
838838

839839
p = <np.ndarray>np.PyArray_FROM_OTF(p, np.NPY_DOUBLE, api.NPY_ARRAY_ALIGNED | api.NPY_ARRAY_C_CONTIGUOUS)
840-
check_array_constraint(p, "p", CONS_BOUNDED_0_1)
841840
pix = <double*>np.PyArray_DATA(p)
842841

843842
if p.ndim != 1:
844843
raise ValueError("`p` must be 1-dimensional")
845844
if p.size != pop_size:
846845
raise ValueError("`a` and `p` must have same size")
847846
p_sum = kahan_sum(pix, d)
847+
if np.isnan(p_sum):
848+
raise ValueError("probabilities contain NaN")
849+
if np.logical_or.reduce(p < 0):
850+
raise ValueError("probabilities are not non-negative")
848851
if abs(p_sum - 1.) > atol:
849852
raise ValueError("probabilities do not sum to 1")
850853

@@ -863,7 +866,7 @@ cdef class Generator:
863866
idx = cdf.searchsorted(uniform_samples, side="right")
864867
idx = np.array(idx, copy=False, dtype=np.int64) # searchsorted returns a scalar
865868
else:
866-
idx = self.integers(0, pop_size, size=shape, dtype=np.int64)
869+
idx = self.integers(0, pop_size, size=shape, dtype=np.int64, use_masked=False)
867870
else:
868871
if size > pop_size:
869872
raise ValueError("Cannot take a larger sample than "
@@ -879,7 +882,7 @@ cdef class Generator:
879882
found = np.zeros(shape, dtype=np.int64)
880883
flat_found = found.ravel()
881884
while n_uniq < size:
882-
x = self.random(size - n_uniq)
885+
x = self.random((size - n_uniq,))
883886
if n_uniq > 0:
884887
p[flat_found[0:n_uniq]] = 0
885888
cdf = np.cumsum(p)
@@ -895,36 +898,46 @@ cdef class Generator:
895898
size_i = size
896899
pop_size_i = pop_size
897900
# This is a heuristic tuning. should be improvable
898-
if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size)):
901+
if shuffle:
902+
cutoff = 50
903+
else:
904+
cutoff = 20
905+
906+
if pop_size_i > 10000 and (size_i > (pop_size_i // cutoff)):
899907
# Tail shuffle size elements
900-
idx = np.arange(pop_size, dtype=np.int64)
901-
idx_ptr = np.PyArray_BYTES(<np.ndarray>idx)
902-
buf_ptr = <char*>&buf
903-
self._shuffle_raw(pop_size_i, max(pop_size_i - size_i, 1),
904-
8, 8, idx_ptr, buf_ptr)
908+
idx = np.PyArray_Arange(0, pop_size_i, 1, np.NPY_INT64)
909+
idx_data = <int64_t*>np.PyArray_DATA(<np.ndarray>idx)
910+
with self.lock, nogil:
911+
self._shuffle_int(pop_size_i, max(pop_size_i - size_i, 1),
912+
idx_data)
905913
# Copy to allow potentially large array backing idx to be gc
906914
idx = idx[(pop_size - size):].copy()
907915
else:
908-
# Floyds's algorithm with precomputed indices
909-
# Worst case, O(n**2) when size is close to pop_size
916+
# Floyd's algorithm
910917
idx = np.empty(size, dtype=np.int64)
911918
idx_data = <int64_t*>np.PyArray_DATA(<np.ndarray>idx)
912-
idx_set = set()
913-
loc = 0
914-
# Sample indices with one pass to avoid reacquiring the lock
915-
with self.lock:
916-
for j in range(pop_size_i - size_i, pop_size_i):
917-
idx_data[loc] = random_interval(&self._bitgen, j)
918-
loc += 1
919-
loc = 0
920-
while len(idx_set) < size_i:
919+
# smallest power of 2 larger than 1.2 * size
920+
set_size = <uint64_t>(1.2 * size_i)
921+
mask = _gen_mask(set_size)
922+
set_size = 1 + mask
923+
hash_set = np.full(set_size, <uint64_t>-1, np.uint64)
924+
with self.lock, cython.wraparound(False), nogil:
921925
for j in range(pop_size_i - size_i, pop_size_i):
922-
if idx_data[loc] not in idx_set:
923-
val = idx_data[loc]
924-
else:
925-
idx_data[loc] = val = j
926-
idx_set.add(val)
927-
loc += 1
926+
val = random_bounded_uint64(&self._bitgen, 0, j, 0, 0)
927+
loc = val & mask
928+
while hash_set[loc] != <uint64_t>-1 and hash_set[loc] != <uint64_t>val:
929+
loc = (loc + 1) & mask
930+
if hash_set[loc] == <uint64_t>-1: # then val not in hash_set
931+
hash_set[loc] = val
932+
idx_data[j - pop_size_i + size_i] = val
933+
else: # we need to insert j instead
934+
loc = j & mask
935+
while hash_set[loc] != <uint64_t>-1:
936+
loc = (loc + 1) & mask
937+
hash_set[loc] = j
938+
idx_data[j - pop_size_i + size_i] = j
939+
if shuffle:
940+
self._shuffle_int(size_i, 1, idx_data)
928941
if shape is not None:
929942
idx.shape = shape
930943

@@ -3908,7 +3921,7 @@ cdef class Generator:
39083921
39093922
Parameters
39103923
----------
3911-
n : int or array-like of ints
3924+
n : {int, array_like[int]}
39123925
Number of experiments.
39133926
pvals : sequence of floats, length p
39143927
Probabilities of each of the ``p`` different outcomes. These
@@ -4286,6 +4299,28 @@ cdef class Generator:
42864299
string.memcpy(data + j * stride, data + i * stride, itemsize)
42874300
string.memcpy(data + i * stride, buf, itemsize)
42884301

4302+
cdef inline void _shuffle_int(self, np.npy_intp n, np.npy_intp first,
4303+
int64_t* data) nogil:
4304+
"""
4305+
Parameters
4306+
----------
4307+
n
4308+
Number of elements in data
4309+
first
4310+
First observation to shuffle. Shuffles n-1,
4311+
n-2, ..., first, so that when first=1 the entire
4312+
array is shuffled
4313+
data
4314+
Location of data
4315+
"""
4316+
cdef np.npy_intp i, j
4317+
cdef int64_t temp
4318+
for i in reversed(range(first, n)):
4319+
j = random_bounded_uint64(&self._bitgen, 0, i, 0, 0)
4320+
temp = data[j]
4321+
data[j] = data[i]
4322+
data[i] = temp
4323+
42894324
def permutation(self, object x):
42904325
"""
42914326
permutation(x)

randomgen/tests/test_against_numpy.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def test_standard_exponential(self):
189189
self.rs.standard_exponential)
190190
self._is_state_common_legacy()
191191

192-
@pytest.mark.xfail(reason="Stream broken for simplicity")
192+
@pytest.mark.xfail(reason="Stream broken for simplicity", strict=True)
193193
def test_tomaxint(self):
194194
self._set_common_state()
195195
self._is_state_common()
@@ -298,7 +298,7 @@ def test_rand(self):
298298
with pytest.deprecated_call():
299299
assert_allclose(f(3, 4, 5), g(3, 4, 5))
300300

301-
@pytest.mark.xfail(reason="Definition of poisson_lam_max changed")
301+
@pytest.mark.xfail(reason="poisson_lam_max changed", strict=True)
302302
def test_poisson_lam_max(self):
303303
assert_allclose(self.rg.poisson_lam_max, self.nprs.poisson_lam_max)
304304

@@ -309,7 +309,7 @@ def test_triangular(self):
309309
self.rg.triangular)
310310
self._is_state_common()
311311

312-
@pytest.mark.xfail(reason="Changes to hypergeometic break stream")
312+
@pytest.mark.xfail(reason="Changes to hypergeometic", strict=True)
313313
def test_hypergeometric(self):
314314
self._set_common_state()
315315
self._is_state_common()
@@ -339,7 +339,7 @@ def test_multinomial(self):
339339
g(100, np.array(p), size=(7, 23)))
340340
self._is_state_common()
341341

342-
@pytest.mark.xfail(reason="Stream broken for performance")
342+
@pytest.mark.xfail(reason="Stream broken for performance", strict=True)
343343
def test_choice(self):
344344
self._set_common_state()
345345
self._is_state_common()

randomgen/tests/test_final_release_changes.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ def test_integers_closed():
5858
random_gen.integers(0, 10, closed=False)
5959

6060

61-
def test_integers_use_masked(endpoint):
61+
def test_integers_use_masked():
6262
with pytest.deprecated_call():
63-
random_gen.integers(0, 10, use_masked=endpoint)
63+
random_gen.integers(0, 10, use_masked=True)
6464

6565

6666
def test_integers_large_negative_value():

randomgen/tests/test_generator_117.py

+23-11
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from distutils.version import LooseVersion
12
from itertools import product
23

34
import numpy as np
@@ -19,6 +20,15 @@
1920
from randomgen import PCG64
2021

2122

23+
v1174 = LooseVersion("1.17.4")
24+
v118 = LooseVersion("1.18")
25+
NP_LT_1174 = LooseVersion(np.__version__) < v1174
26+
NP_LT_1174_OR_GT_118 = NP_LT_1174 or LooseVersion(np.__version__) > v118
27+
28+
pytestmark = pytest.mark.skipif(NP_LT_1174_OR_GT_118,
29+
reason="Only test 1.17.4 to 1.18.x")
30+
31+
2232
def positive_param():
2333
base = Generator(PCG64())
2434
return [base.chisquare(10),
@@ -221,35 +231,37 @@ def test_permutation():
221231
assert_array_equal(result, expected)
222232

223233

224-
@pytest.mark.xfail(reason="Choice has not been updated")
225234
@pytest.mark.parametrize("replace", [True, False])
226-
def test_choice(replace):
227-
np_gen.bit_generator.state = initial_state
235+
def test_choice_with_p(replace):
228236
x = np.arange(100)
229-
expected = np_gen.choice(x, size=10, replace=replace)
237+
np_gen.bit_generator.state = initial_state
238+
p = (x + 1) / (x + 1).sum()
239+
expected = np_gen.choice(x, size=10, replace=replace, p=p)
230240

231241
gen.bit_generator.state = initial_state
232-
result = gen.choice(x, size=10, replace=replace)
242+
result = gen.choice(x, size=10, replace=replace, p=p)
233243
assert_array_equal(result, expected)
234244

245+
246+
@pytest.mark.parametrize("replace", [True, False])
247+
def test_choice(replace):
235248
np_gen.bit_generator.state = initial_state
236-
p = (x + 1) / (x + 1).sum()
237-
expected = np_gen.choice(x, size=10, replace=replace, p=p)
249+
x = np.arange(100)
250+
expected = np_gen.choice(x, size=10, replace=replace)
238251

239252
gen.bit_generator.state = initial_state
240-
result = gen.choice(x, size=10, replace=replace, p=p)
253+
result = gen.choice(x, size=10, replace=replace)
241254
assert_array_equal(result, expected)
242255

243256

244-
@pytest.mark.xfail(reason="Changes to lemire generators")
257+
@pytest.mark.skipif(NP_LT_1174, reason="Changes to lemire generators")
245258
@pytest.mark.parametrize("args", integers())
246259
def test_integers(args):
247260
np_gen.bit_generator.state = initial_state
248261
expected = np_gen.integers(*args)
249262

250263
gen.bit_generator.state = initial_state
251-
with pytest.deprecated_call():
252-
result = gen.integers(*args, use_masked=False)
264+
result = gen.integers(*args, use_masked=False)
253265
assert_array_equal(result, expected)
254266

255267

0 commit comments

Comments
 (0)