@@ -570,11 +570,11 @@ cdef class Generator:
570
570
571
571
Parameters
572
572
----------
573
- low : int or array-like of ints
573
+ low : {int, array_like[int]}
574
574
Lowest (signed) integers to be drawn from the distribution (unless
575
575
``high=None``, in which case this parameter is one above the
576
576
*highest* such integer).
577
- high : int or array-like of ints , optional
577
+ high : {int, array_like[int]}, optional
578
578
If provided, one above the largest (signed) integer to be drawn
579
579
from the distribution (see above for behavior if ``high=None``).
580
580
If array-like, must contain integer values
@@ -648,10 +648,10 @@ cdef class Generator:
648
648
CoRR, Aug. 13, 2018, http://arxiv.org/abs/1805.10941.
649
649
650
650
"""
651
- if use_masked is not None :
651
+ if use_masked is not None and use_masked :
652
652
import warnings
653
653
warnings .warn ("use_masked will be removed in the final release and"
654
- "only the Lemire method will be available." ,
654
+ " only the Lemire method will be available." ,
655
655
DeprecationWarning )
656
656
if closed is not None :
657
657
import warnings
@@ -725,32 +725,34 @@ cdef class Generator:
725
725
return self .integers (0 , 4294967296 , size = n_uint32 , dtype = np .uint32 ).tobytes ()[:length ]
726
726
727
727
@cython .wraparound (True )
728
- def choice (self , a , size = None , replace = True , p = None , axis = 0 ):
728
+ def choice (self , a , size = None , replace = True , p = None , axis = 0 , bint shuffle = True ):
729
729
"""
730
- choice(a, size=None, replace=True, p=None, axis=0):
731
-
732
- Generates a random sample from a given 1-D array
733
-
734
- .. versionadded:: 1.7.0
730
+ choice(a, size=None, replace=True, p=None, axis=0, shuffle=True):
735
731
736
732
Parameters
737
733
----------
738
- a : 1-D array-like or int
734
+ a : {array_like, int}
739
735
If an ndarray, a random sample is generated from its elements.
740
736
If an int, the random sample is generated as if a were np.arange(a)
741
737
size : int or tuple of ints, optional
742
- Output shape. If the given shape is, e.g., ``(m, n, k)``, then
743
- ``m * n * k`` samples are drawn. Default is None, in which case a
744
- single value is returned.
738
+ Output shape. If the given shape is, e.g., ``(m, n, k)``, then
739
+ ``m * n * k`` samples are drawn from the 1-d `a`. If `a` has more
740
+ than one dimension, the `size` shape will be inserted into the
741
+ `axis` dimension, so the output ``ndim`` will be ``a.ndim - 1 +
742
+ len(size)``. Default is None, in which case a single value is
743
+ returned.
745
744
replace : boolean, optional
746
745
Whether the sample is with or without replacement
747
- p : 1-D array-like , optional
746
+ p : 1-D array_like, optional
748
747
The probabilities associated with each entry in a.
749
748
If not given the sample assumes a uniform distribution over all
750
749
entries in a.
751
750
axis : int, optional
752
751
The axis along which the selection is performed. The default, 0,
753
752
selects by row.
753
+ shuffle : boolean, optional
754
+ Whether the sample is shuffled when sampling without replacement.
755
+ Default is True. Setting it to False provides a speedup.
754
756
755
757
Returns
756
758
-------
@@ -805,22 +807,20 @@ cdef class Generator:
805
807
dtype='<U11')
806
808
807
809
"""
808
- cdef char * idx_ptr
809
- cdef int64_t buf
810
- cdef char * buf_ptr
811
810
812
- cdef set idx_set
813
811
cdef int64_t val , t , loc , size_i , pop_size_i
814
812
cdef int64_t * idx_data
815
813
cdef np .npy_intp j
814
+ cdef uint64_t set_size , mask
815
+ cdef uint64_t [::1 ] hash_set
816
816
# Format and Verify input
817
817
a = np .array (a , copy = False )
818
818
if a .ndim == 0 :
819
819
try :
820
820
# __index__ must return an integer by python rules.
821
821
pop_size = operator .index (a .item ())
822
822
except TypeError :
823
- raise ValueError ("`a` must be 1-dimensional or an integer" )
823
+ raise ValueError ("`a` must an array or an integer" )
824
824
if pop_size <= 0 and np .prod (size ) != 0 :
825
825
raise ValueError ("`a` must be greater than 0 unless no samples are taken" )
826
826
else :
@@ -837,14 +837,17 @@ cdef class Generator:
837
837
atol = max (atol , np .sqrt (np .finfo (p .dtype ).eps ))
838
838
839
839
p = < np .ndarray > np .PyArray_FROM_OTF (p , np .NPY_DOUBLE , api .NPY_ARRAY_ALIGNED | api .NPY_ARRAY_C_CONTIGUOUS )
840
- check_array_constraint (p , "p" , CONS_BOUNDED_0_1 )
841
840
pix = < double * > np .PyArray_DATA (p )
842
841
843
842
if p .ndim != 1 :
844
843
raise ValueError ("`p` must be 1-dimensional" )
845
844
if p .size != pop_size :
846
845
raise ValueError ("`a` and `p` must have same size" )
847
846
p_sum = kahan_sum (pix , d )
847
+ if np .isnan (p_sum ):
848
+ raise ValueError ("probabilities contain NaN" )
849
+ if np .logical_or .reduce (p < 0 ):
850
+ raise ValueError ("probabilities are not non-negative" )
848
851
if abs (p_sum - 1. ) > atol :
849
852
raise ValueError ("probabilities do not sum to 1" )
850
853
@@ -863,7 +866,7 @@ cdef class Generator:
863
866
idx = cdf .searchsorted (uniform_samples , side = "right" )
864
867
idx = np .array (idx , copy = False , dtype = np .int64 ) # searchsorted returns a scalar
865
868
else :
866
- idx = self .integers (0 , pop_size , size = shape , dtype = np .int64 )
869
+ idx = self .integers (0 , pop_size , size = shape , dtype = np .int64 , use_masked = False )
867
870
else :
868
871
if size > pop_size :
869
872
raise ValueError ("Cannot take a larger sample than "
@@ -879,7 +882,7 @@ cdef class Generator:
879
882
found = np .zeros (shape , dtype = np .int64 )
880
883
flat_found = found .ravel ()
881
884
while n_uniq < size :
882
- x = self .random (size - n_uniq )
885
+ x = self .random (( size - n_uniq ,) )
883
886
if n_uniq > 0 :
884
887
p [flat_found [0 :n_uniq ]] = 0
885
888
cdf = np .cumsum (p )
@@ -895,36 +898,46 @@ cdef class Generator:
895
898
size_i = size
896
899
pop_size_i = pop_size
897
900
# This is a heuristic tuning and should be improvable.
898
- if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size )):
901
+ if shuffle :
902
+ cutoff = 50
903
+ else :
904
+ cutoff = 20
905
+
906
+ if pop_size_i > 10000 and (size_i > (pop_size_i // cutoff )):
899
907
# Tail shuffle size elements
900
- idx = np .arange ( pop_size , dtype = np .int64 )
901
- idx_ptr = np .PyArray_BYTES (< np .ndarray > idx )
902
- buf_ptr = < char * > & buf
903
- self ._shuffle_raw (pop_size_i , max (pop_size_i - size_i , 1 ),
904
- 8 , 8 , idx_ptr , buf_ptr )
908
+ idx = np .PyArray_Arange ( 0 , pop_size_i , 1 , np .NPY_INT64 )
909
+ idx_data = < int64_t * > np .PyArray_DATA (< np .ndarray > idx )
910
+ with self . lock , nogil :
911
+ self ._shuffle_int (pop_size_i , max (pop_size_i - size_i , 1 ),
912
+ idx_data )
905
913
# Copy so that the potentially large array backing idx can be garbage collected
906
914
idx = idx [(pop_size - size ):].copy ()
907
915
else :
908
- # Floyds's algorithm with precomputed indices
909
- # Worst case, O(n**2) when size is close to pop_size
916
+ # Floyd's algorithm
910
917
idx = np .empty (size , dtype = np .int64 )
911
918
idx_data = < int64_t * > np .PyArray_DATA (< np .ndarray > idx )
912
- idx_set = set ()
913
- loc = 0
914
- # Sample indices with one pass to avoid reacquiring the lock
915
- with self .lock :
916
- for j in range (pop_size_i - size_i , pop_size_i ):
917
- idx_data [loc ] = random_interval (& self ._bitgen , j )
918
- loc += 1
919
- loc = 0
920
- while len (idx_set ) < size_i :
919
+ # smallest power of 2 larger than 1.2 * size
920
+ set_size = < uint64_t > (1.2 * size_i )
921
+ mask = _gen_mask (set_size )
922
+ set_size = 1 + mask
923
+ hash_set = np .full (set_size , < uint64_t > - 1 , np .uint64 )
924
+ with self .lock , cython .wraparound (False ), nogil :
921
925
for j in range (pop_size_i - size_i , pop_size_i ):
922
- if idx_data [loc ] not in idx_set :
923
- val = idx_data [loc ]
924
- else :
925
- idx_data [loc ] = val = j
926
- idx_set .add (val )
927
- loc += 1
926
+ val = random_bounded_uint64 (& self ._bitgen , 0 , j , 0 , 0 )
927
+ loc = val & mask
928
+ while hash_set [loc ] != < uint64_t > - 1 and hash_set [loc ] != < uint64_t > val :
929
+ loc = (loc + 1 ) & mask
930
+ if hash_set [loc ] == < uint64_t > - 1 : # then val not in hash_set
931
+ hash_set [loc ] = val
932
+ idx_data [j - pop_size_i + size_i ] = val
933
+ else : # we need to insert j instead
934
+ loc = j & mask
935
+ while hash_set [loc ] != < uint64_t > - 1 :
936
+ loc = (loc + 1 ) & mask
937
+ hash_set [loc ] = j
938
+ idx_data [j - pop_size_i + size_i ] = j
939
+ if shuffle :
940
+ self ._shuffle_int (size_i , 1 , idx_data )
928
941
if shape is not None :
929
942
idx .shape = shape
930
943
@@ -3908,7 +3921,7 @@ cdef class Generator:
3908
3921
3909
3922
Parameters
3910
3923
----------
3911
- n : int or array-like of ints
3924
+ n : {int, array_like[int]}
3912
3925
Number of experiments.
3913
3926
pvals : sequence of floats, length p
3914
3927
Probabilities of each of the ``p`` different outcomes. These
@@ -4286,6 +4299,28 @@ cdef class Generator:
4286
4299
string .memcpy (data + j * stride , data + i * stride , itemsize )
4287
4300
string .memcpy (data + i * stride , buf , itemsize )
4288
4301
4302
+ cdef inline void _shuffle_int (self , np .npy_intp n , np .npy_intp first ,
4303
+ int64_t * data ) nogil :
4304
+ """
4305
+ Parameters
4306
+ ----------
4307
+ n
4308
+ Number of elements in data
4309
+ first
4310
+ First observation to shuffle. Shuffles n-1,
4311
+ n-2, ..., first, so that when first=1 the entire
4312
+ array is shuffled
4313
+ data
4314
+ Location of data
4315
+ """
4316
+ cdef np .npy_intp i , j
4317
+ cdef int64_t temp
4318
+ for i in reversed (range (first , n )):
4319
+ j = random_bounded_uint64 (& self ._bitgen , 0 , i , 0 , 0 )
4320
+ temp = data [j ]
4321
+ data [j ] = data [i ]
4322
+ data [i ] = temp
4323
+
4289
4324
def permutation (self , object x ):
4290
4325
"""
4291
4326
permutation(x)
0 commit comments