#ifndef INT_UTIL_H
#define INT_UTIL_H
/* Copyright (c) 2019 by NEC Corporation
* This file is part of ve-jit */
#include <stdint.h>
#ifdef __cplusplus
extern "C" { //}
#if __cplusplus >= 201103L
#define FASTDIV_INLINE inline constexpr
#else
#define FASTDIV_INLINE inline
#endif
#else
#define FASTDIV_INLINE inline
#endif
extern int const bitcount[256];
extern int const multiplyDeBruijnBitPosition2[32];
/// \defgroup integer_arithmetic Integer arithmetic
//@{
/** divide-by-\c _odiv via mul-add-shift */
struct fastdiv {
uint32_t mul; /* magic multiplier */
uint32_t add; /* post-multiply addend */
int32_t shift; /* final right shift (0..64, see fastdiv_make) */
uint32_t _odiv; /* save original divisor for modulo calc */
};
/** \b If \c positivePow2(v), i.e. \c v==2**N for some N>0, \b then return N. */
inline int /*constexpr*/ positivePow2Shift(uint32_t v) {
//assert( positivePow2(v) );
return multiplyDeBruijnBitPosition2[(uint32_t)(v * 0x077CB531U) >> 27];
}
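/* Example (a sketch; assumes the standard de Bruijn table contents):
* positivePow2Shift(8) == 3, positivePow2Shift(1024) == 10.
* The result is meaningless when v is not a power of two. */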
/** unsigned 32-bit log2, required for magic constant generation */
inline uint32_t ulog2(uint32_t v) {
uint32_t r, shift;
r = (v > 0xFFFF) << 4; v >>= r;
shift = (v > 0xFF ) << 3; v >>= shift; r |= shift;
shift = (v > 0xF ) << 2; v >>= shift; r |= shift;
shift = (v > 0x3 ) << 1; v >>= shift; r |= shift;
r |= (v >> 1);
return r;
}
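/* Example: ulog2 is floor(log2(v)) computed by branchless binary search,
* e.g. ulog2(1)==0, ulog2(17)==4 (since 2^4 <= 17 < 2^5), ulog2(0x10000)==16. */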
/** generate constants for implementing a division with multiply-add-shift.
* \b MODIFIED for VE JIT, so shift is 0 to 64.
*
* - x86 shift was 32+N, nice for x86 ops.
* - VE does better with a single shift
*/
void fastdiv_make(struct fastdiv *d, uint32_t divisor);
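/* Usage sketch. The evaluation expression below is an assumption inferred
* from the struct fields (fastdiv_make is defined elsewhere), not a
* documented API of this header:
* \code
* struct fastdiv d;
* fastdiv_make(&d, 7u); // precompute constants for division by 7
* uint32_t q = (uint32_t)(((uint64_t)a * d.mul + d.add) >> d.shift); // a/7
* uint32_t r = a - q * d._odiv; // a%7, via the saved divisor
* \endcode
*/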
//@}
/// \defgroup fastdiv64
/// fast division for 'small enough' numbers,
/// without intermediates overflowing 64 bits.
/// see https://github.com/lemire/constantdivisionbenchmarks, "lkk" algorithm.
/// Issue #1 comments on a similar 64-bit-only version.
///@{
#define FASTDIV_B 21 /* FASTDIV_B up to 21 is OK to keep all intermediates in 64-bit range */
/** The *_uB functions were verified exhaustively for all inputs up to FASTDIV_SAFEMAX (this took a while to run).
* computeM_uB is two ops: divide + increment. This might be fast enough
* to precalculate routinely at runtime for use in loops. */
#define FASTDIV_SAFEMAX ((1U<<FASTDIV_B)-1U)
#define FASTDIV_C (2*FASTDIV_B)
/* high 22 bits zero, low FASTDIV_C (= 42) bits one */
#define FASTDIV_CMASK ((UINT64_C(1)<<FASTDIV_C)-1)
FASTDIV_INLINE uint64_t computeM_uB(uint32_t d) {
return FASTDIV_CMASK / d + 1;
}
/** fastdiv_uB computes (a / d) given precomputed M for d>1.
* \pre a,d < (1<<21). */
FASTDIV_INLINE uint32_t fastdiv_uB(uint32_t const a, uint64_t const M) {
return M*a >> FASTDIV_C; // 2 ops: mul, shr
}
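/* Example: hoist computeM_uB out of a loop so each division is just
* mul + shr. A minimal sketch (needs <assert.h>):
* \code
* uint64_t const M = computeM_uB(10u); // one div + inc, done once
* for (uint32_t a = 0; a <= FASTDIV_SAFEMAX; ++a)
*     assert( fastdiv_uB(a, M) == a / 10u );
* \endcode
*/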
/** fastmod_uB computes \c (a%d) given precomputed \c M.
* \c M is a pre-computed function of \c d, from \c computeM_uB(d).
* Probably nicer to calculate uint64_t D=a/d using fastdiv_uB, and then R=a-D*d
* (retains everything in uint64_t registers).
* \pre \c a and \c d < (1<<21).
* When \c d is a power of two (\f$(d\&(d-1)) == 0\f$), just use shifts, \ref positivePow2.
*/
FASTDIV_INLINE uint32_t fastmod_uB(uint32_t const a, uint64_t const M, uint32_t const d) {
// Lemire original
//uint64_t lowbits = M * a;
//return ((__uint128_t)lowbits * d) >> 64;
// lowbits version
#if 0
// ugly with 'lowbits' approach, nicer to just do the fastdiv followed by mul and sub :(
// HOWEVER, in JIT code VE has the mask as an immediate constant!
uint64_t lowbits = (M * a) & FASTDIV_CMASK;
return (lowbits * d) >> FASTDIV_C; // ops: mul, and, mul, shr [4 ops]
// divmod would be like:
// i = M*a;
// div = i>>C;
// j = i & C_MASK; // in parallel w/ prev op
// k = j * d;
// mod = k>>C;
#endif
// 3-4 ops: mul, shr, mul-sub... AND get a/d too
// (VE has no fused mul-sub for unsigned, though, so 4 ops)
return a - (M*a>>FASTDIV_C)*d;
// divmod would be like:
// i = M*a;
// div = i>>C;
// j = div *d;
// mod = a - j;
// so fewer ops, still 2 multiplies
}
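/* Example: divmod with two multiplies total, reusing the quotient as the
* comment above suggests:
* \code
* uint64_t const M = computeM_uB(7u);
* uint32_t const q = fastdiv_uB(a, M); // a/7: mul, shr
* uint32_t const r = a - q * 7u; // a%7 == fastmod_uB(a, M, 7u)
* \endcode
*/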
/** Is \c a%d==0 ?
* \param a numerator
* \param M precomputed \c computeM_uB(d)
* \pre \c a and \c d < (1<<21).
* If \c d is known to be a power of two, use \c ((a&(d-1))==0) instead.
* Especially nice for VE because it avoids rotation (VE has no rotate op).
*/
FASTDIV_INLINE int fastdivisible_uB(uint32_t const a, uint64_t const M) {
return ((M*a) & FASTDIV_CMASK) <= M-1;
// obviously much faster than the naive way using fastmod_uB:
// return (a - ((M*a)>>C)*d) == 0;
}
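/* Example: the low FASTDIV_C bits of M*a carry the scaled remainder, so
* divisibility needs no second multiply (needs <assert.h>):
* \code
* uint64_t const M = computeM_uB(3u);
* assert(  fastdivisible_uB( 9u, M) ); // 9 % 3 == 0
* assert( !fastdivisible_uB(10u, M) ); // 10 % 3 != 0
* \endcode
*/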
//#undef FASTDIV_CMASK
//#undef FASTDIV_C
//#undef FASTDIV_SAFEMAX
//#undef FASTDIV_B
//@}
#undef FASTDIV_INLINE
#ifdef __cplusplus //{
}//extern "C"
#endif
// vim: ts=4 sw=4 et cindent cino=^=l0,\:.5s,=-.5s,N-s,g.5s,b1 cinkeys=0{,0},0),\:,0#,!^F,o,O,e,0=break
#endif //INT_UTIL_H