-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathsimdxorshift128plus.c
124 lines (116 loc) · 4.06 KB
/
simdxorshift128plus.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#include <stdio.h>
#include "simdxorshift128plus.h"
/* used by xorshift128plus_jump_onkeys */
static void xorshift128plus_onkeys(uint64_t * ps0, uint64_t * ps1) {
uint64_t s1 = *ps0;
const uint64_t s0 = *ps1;
*ps0 = s0;
s1 ^= s1 << 23; // a
*ps1 = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
}
/* used by avx_xorshift128plus_init */
static void xorshift128plus_jump_onkeys(uint64_t in1, uint64_t in2,
uint64_t * output1, uint64_t * output2) {
/* todo: streamline */
static const uint64_t JUMP[] = { 0x8a5cd789635d2dff, 0x121fd2155c472f96 };
uint64_t s0 = 0;
uint64_t s1 = 0;
for (unsigned int i = 0; i < sizeof(JUMP) / sizeof(*JUMP); i++)
for (int b = 0; b < 64; b++) {
if (JUMP[i] & 1ULL << b) {
s0 ^= in1;
s1 ^= in2;
}
xorshift128plus_onkeys(&in1, &in2);
}
output1[0] = s0;
output2[0] = s1;
}
void avx_xorshift128plus_init(uint64_t key1, uint64_t key2,
avx_xorshift128plus_key_t *key) {
uint64_t S0[4];
uint64_t S1[4];
S0[0] = key1;
S1[0] = key2;
xorshift128plus_jump_onkeys(*S0, *S1, S0 + 1, S1 + 1);
xorshift128plus_jump_onkeys(*(S0 + 1), *(S1 + 1), S0 + 2, S1 + 2);
xorshift128plus_jump_onkeys(*(S0 + 2), *(S1 + 2), S0 + 3, S1 + 3);
key->part1 = _mm256_loadu_si256((const __m256i *) S0);
key->part2 = _mm256_loadu_si256((const __m256i *) S1);
}
/*
Return a 256-bit random "number"
*/
__m256i avx_xorshift128plus(avx_xorshift128plus_key_t *key) {
__m256i s1 = key->part1;
const __m256i s0 = key->part2;
key->part1 = key->part2;
s1 = _mm256_xor_si256(key->part2, _mm256_slli_epi64(key->part2, 23));
key->part2 = _mm256_xor_si256(
_mm256_xor_si256(_mm256_xor_si256(s1, s0),
_mm256_srli_epi64(s1, 18)), _mm256_srli_epi64(s0, 5));
return _mm256_add_epi64(key->part2, s0);
}
/**
* equivalent to skipping 2^64 avx_xorshift128plus() calls
* useful to generate a new key from an existing one (multi-threaded context).
*/
void avx_xorshift128plus_jump(avx_xorshift128plus_key_t * key) {
uint64_t S0[4];
uint64_t S1[4];
S0[0] = _mm256_extract_epi64(key->part1, 3);
S1[0] = _mm256_extract_epi64(key->part2, 3);
xorshift128plus_jump_onkeys(*S0, *S1, S0, S1);
xorshift128plus_jump_onkeys(*S0, *S1, S0 + 1, S1 + 1);
xorshift128plus_jump_onkeys(*(S0 + 1), *(S1 + 1), S0 + 2, S1 + 2);
xorshift128plus_jump_onkeys(*(S0 + 2), *(S1 + 2), S0 + 3, S1 + 3);
key->part1 = _mm256_loadu_si256((const __m256i *) S0);
key->part2 = _mm256_loadu_si256((const __m256i *) S1);
}
/**
* Given 8 random 32-bit integers in randomvals,
* derive 8 random 32-bit integers that are less than equal
* than the 32-bit integers in upperbound using the
* following heuristic:
*
* ( randomval * bound ) >> 32
*
* This approach generates a very slight bias, but if you
* are using an xorshift random number generator, it is probably
* the last of concerns.
*
*/
static __m256i avx_randombound_epu32(__m256i randomvals, __m256i upperbound) {
/* four values */
__m256i evenparts = _mm256_srli_epi64(
_mm256_mul_epu32(randomvals, upperbound), 32);
/* four other values */
__m256i oddparts = _mm256_mul_epu32(_mm256_srli_epi64(randomvals, 32),
upperbound);
/* note:shift could be replaced by shuffle */
/* need to blend the eight values */
return _mm256_blend_epi32(evenparts, oddparts, 0b10101010);
}
void avx_xorshift128plus_shuffle32(avx_xorshift128plus_key_t *key,
uint32_t *storage, uint32_t size) {
uint32_t i;
uint32_t randomsource[8];
__m256i interval = _mm256_setr_epi32(size, size - 1, size - 2, size - 3,
size - 4, size - 5, size - 6, size - 7);
__m256i R = avx_randombound_epu32(avx_xorshift128plus(key), interval);
_mm256_storeu_si256((__m256i *) randomsource, R);
__m256i vec8 = _mm256_set1_epi32(8);
for (i = size; i > 1;) {
for (int j = 0; j < 8; ++j) {
uint32_t nextpos = randomsource[j];
int tmp = storage[i - 1]; // likely in cache
int val = storage[nextpos]; // could be costly
storage[i - 1] = val;
storage[nextpos] = tmp; // you might have to read this store later
i--;
}
interval = _mm256_sub_epi32(interval, vec8);
R = avx_randombound_epu32(avx_xorshift128plus(key), interval);
_mm256_storeu_si256((__m256i *) randomsource, R);
}
}