Utils.hpp
#pragma once
#include <cstdlib>
//#include <emscripten/bind.h>
//#include <emscripten/val.h>
#ifdef OMP
#include <omp.h>
#endif
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <thread>
#include <type_traits>
#include <vector>
#include <iostream>
// Per-block working-set budget for the cache-friendly loops below (16 MB).
#define L2_CACHE_SIZE (16 * 1024 * 1024)
//#include <sanitizer/lsan_interface.h>
//typedef emscripten::val em_val;
// Runs operation(i) for i in [0, end), dispatching at compile time to a serial
// loop (SINGLE), OpenMP (OMP), or hand-rolled std::threads (Emscripten threads
// or MYLOOP). The callable may take either (index) or (index, thread_id).
template <typename T, typename op>
void hybrid_loop(T end, op operation)
{
    // Forward to whichever of the two supported signatures the callable has.
    auto operation_wrapper = [&](T i, int tid = 0)
    {
        if constexpr (std::is_invocable_v<op, T>) operation(i);
        else operation(i, tid);
    };
#ifdef SINGLE
    for (T i = 0; i < end; ++i) operation_wrapper(i);
#elif defined(OMP)
    #pragma omp parallel for
    for (T i = 0; i < end; ++i) operation_wrapper(i, omp_get_thread_num());
#elif defined __EMSCRIPTEN_THREADS__ || defined MYLOOP
    if (end <= 0) return; // nothing to do; also avoids a zero block size below
    const int num_threads = std::thread::hardware_concurrency();
    // Split the range into equal blocks, one per thread. ex: 3 threads, start = 0, end = 8
    // Thread 0: 0,1,2
    // Thread 1: 3,4,5
    // Thread 2: 6,7
    // Also don't spawn more threads than needed
    // ex: 4 threads, start = 0, end = 3
    // Thread 0: 0
    // Thread 1: 1
    // Thread 2: 2
    // Thread 3: NOT SPAWNED
    const T block_size = (end + num_threads - 1) / num_threads;
    std::vector<std::thread> threads;
    const int threads_needed = std::min(num_threads, (int)std::ceil(end / (float)block_size));
    for (int tid = 0; tid < threads_needed; ++tid)
    {
        threads.emplace_back([=]() {
            const T block_start = tid * block_size;
            const T block_end = (tid == threads_needed - 1) ? end : block_start + block_size;
            for (T i = block_start; i < block_end; ++i) operation_wrapper(i, tid);
        });
    }
    for (auto &thread : threads) thread.join();
#endif
}
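/* Usage sketch (illustrative, not part of the library): the callable may take
   the index alone, or the index plus the worker thread id.

   std::vector<float> data(1024);
   hybrid_loop(data.size(), [&](auto i) { data[i] *= 2.0f; });
   hybrid_loop(data.size(), [&](auto i, int tid) { data[i] += tid; });
*/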
#define MALLOC_V4SF_ALIGNMENT 64
// Over-allocate by the alignment, round the pointer up to a 64-byte boundary,
// and stash the original malloc pointer in the slot just before the aligned
// address so Valigned_free can recover it.
static void* Valigned_malloc(size_t nb_bytes) {
    void* p, * p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
    if (!p0) return (void*)0;
    p = (void*)(((size_t)p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t)(MALLOC_V4SF_ALIGNMENT - 1))));
    *((void**)p - 1) = p0;
    return p;
}
static void Valigned_free(void* p) {
    if (p) free(*((void**)p - 1));
}
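/* Usage sketch (illustrative): the returned pointer is 64-byte aligned and
   must be released with Valigned_free, never with plain free().

   float* buf = (float*)Valigned_malloc(1024 * sizeof(float));
   // ... SIMD-friendly work on buf ...
   Valigned_free(buf);
*/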
// STL-compatible allocator (classic C++03 interface, as used with pffft) that
// hands out 64-byte-aligned storage via Valigned_malloc/Valigned_free.
template <class T>
class PFAlloc {
public:
    // type definitions
    typedef T value_type;
    typedef T* pointer;
    typedef const T* const_pointer;
    typedef T& reference;
    typedef const T& const_reference;
    typedef std::size_t size_type;
    typedef std::ptrdiff_t difference_type;

    // rebind allocator to type U
    template <class U>
    struct rebind {
        typedef PFAlloc<U> other;
    };

    // return address of values
    pointer address(reference value) const {
        return &value;
    }
    const_pointer address(const_reference value) const {
        return &value;
    }

    /* constructors and destructor
     * - nothing to do because the allocator has no state
     */
    PFAlloc() throw() {
    }
    PFAlloc(const PFAlloc&) throw() {
    }
    template <class U>
    PFAlloc(const PFAlloc<U>&) throw() {
    }
    ~PFAlloc() throw() {
    }

    // return maximum number of elements that can be allocated
    size_type max_size() const throw() {
        return std::numeric_limits<std::size_t>::max() / sizeof(T);
    }

    // allocate but don't initialize num elements of type T
    pointer allocate(size_type num, const void* = 0) {
        return (pointer)Valigned_malloc(num * sizeof(T));
    }

    // initialize elements of allocated storage p with value value
    void construct(pointer p, const T& value) {
        // initialize memory with placement new
        new((void*)p)T(value);
    }

    // destroy elements of initialized storage p
    void destroy(pointer p) {
        // destroy objects by calling their destructor
        p->~T();
    }

    // deallocate storage p of deleted elements
    void deallocate(pointer p, size_type num) {
        // release the aligned block obtained from Valigned_malloc
        Valigned_free((void*)p);
    }
};
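/* Usage sketch (illustrative): drop-in allocator for standard containers,
   giving 64-byte-aligned element storage (useful for SIMD/pffft buffers).

   std::vector<float, PFAlloc<float>> signal(4096);
*/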
// Utils from pffft to check whether a transform size is efficient for the FFT
// (i.e. of the form N_min * 2^a * 3^b * 5^c) and to find the nearest such size.
// Marked inline since this header may be included in several translation units.
inline int isValidSize(int N) {
    const int N_min = 32;
    int R = N;
    while (R >= 5 * N_min && (R % 5) == 0) R /= 5;
    while (R >= 3 * N_min && (R % 3) == 0) R /= 3;
    while (R >= 2 * N_min && (R % 2) == 0) R /= 2;
    return (R == N_min) ? 1 : 0;
}
inline int nearestTransformSize(int N) {
    const int N_min = 32;
    if (N < N_min) N = N_min;
    N = N_min * ((N + N_min - 1) / N_min);  // round up to a multiple of N_min
    while (!isValidSize(N)) N += N_min;     // then step up until the size is valid
    return N;
}
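/* Worked example (illustrative): a 1000-point transform is rounded up to
   1024 = 32 * 2^5, which isValidSize accepts.

   int n = nearestTransformSize(1000); // n == 1024
*/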
// Cache-friendly BGR deinterleave: processes the image in blocks sized to the
// L2_CACHE_SIZE budget (inspired by flip-block). When converting from a
// floating-point T to an integral U, 0.5 is added so truncation rounds to nearest.
template<typename T, typename U>
void deinterleave_BGR(const T* const interleaved_BGR, U** const deinterleaved_BGR, const uint32_t total_size) {
    constexpr float round = (std::is_integral_v<U> && !std::is_integral_v<T>) ? 0.5f : 0.f;
    constexpr uint32_t block = L2_CACHE_SIZE / (3 * std::max(sizeof(T), sizeof(U)));
    const uint32_t num_blocks = std::ceil(total_size / (float)block);
    const uint32_t last_block_size = total_size % block == 0 ? block : total_size % block;
    hybrid_loop(num_blocks, [&](auto n) {
        const uint32_t x = n * block;
        U* const B = deinterleaved_BGR[0] + x;
        U* const G = deinterleaved_BGR[1] + x;
        U* const R = deinterleaved_BGR[2] + x;
        const T* const interleaved_ptr = interleaved_BGR + x * 3;
        const int blockx = (n == num_blocks - 1) ? last_block_size : block;
        for (int xx = 0; xx < blockx; ++xx)
        {
            B[xx] = interleaved_ptr[xx * 3 + 0] + round;
            G[xx] = interleaved_ptr[xx * 3 + 1] + round;
            R[xx] = interleaved_ptr[xx * 3 + 2] + round;
        }
    });
}
// Inverse of deinterleave_BGR: merges three planar channels back into an
// interleaved BGR buffer, with the same blocking and rounding behaviour.
// (Taking const U* const* rather than const U** also accepts a plain U**.)
template<typename T, typename U>
void interleave_BGR(const U* const* const deinterleaved_BGR, T* const interleaved_BGR, const uint32_t total_size) {
    constexpr float round = (std::is_integral_v<T> && !std::is_integral_v<U>) ? 0.5f : 0.f;
    constexpr uint32_t block = L2_CACHE_SIZE / (3 * std::max(sizeof(T), sizeof(U)));
    const uint32_t num_blocks = std::ceil(total_size / (float)block);
    const uint32_t last_block_size = total_size % block == 0 ? block : total_size % block;
    hybrid_loop(num_blocks, [&](auto n) {
        const uint32_t x = n * block;
        const U* const B = deinterleaved_BGR[0] + x;
        const U* const G = deinterleaved_BGR[1] + x;
        const U* const R = deinterleaved_BGR[2] + x;
        T* const interleaved_ptr = interleaved_BGR + x * 3;
        const int blockx = (n == num_blocks - 1) ? last_block_size : block;
        for (int xx = 0; xx < blockx; ++xx)
        {
            interleaved_ptr[xx * 3 + 0] = B[xx] + round;
            interleaved_ptr[xx * 3 + 1] = G[xx] + round;
            interleaved_ptr[xx * 3 + 2] = R[xx] + round;
        }
    });
}
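/* Usage sketch (illustrative): split an interleaved 8-bit BGR image into float
   planes and merge it back. The sizes and names here are assumptions.

   const uint32_t width = 640, height = 480;
   const uint32_t npx = width * height;          // pixels per channel
   std::vector<uint8_t> bgr(npx * 3);            // interleaved input
   std::vector<float> b(npx), g(npx), r(npx);
   float* planes[3] = { b.data(), g.data(), r.data() };
   deinterleave_BGR(bgr.data(), planes, npx);
   interleave_BGR(planes, bgr.data(), npx);      // round-trip back
*/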
// Pads a 2D matrix or multichannel image (C channels) with the given
// top/bottom/left/right amounts using reflect-101 borders, like
// cv::copyMakeBorder. The main (and only) difference is the clamping below,
// which caps each pad at size-1 to prevent reads outside the input buffer.
template<typename T, int C>
void Reflect_101(const T* const input, T* output, int pad_top, int pad_bottom, int pad_left, int pad_right, const int* original_size) {
    pad_top = std::min(pad_top, original_size[0] - 1);
    pad_bottom = std::min(pad_bottom, original_size[0] - 1);
    pad_left = std::min(pad_left, original_size[1] - 1);
    pad_right = std::min(pad_right, original_size[1] - 1);
    const int stride[2] = { original_size[0], original_size[1] * C };  // rows, elements per row
    const int padded[2] = { stride[0] + pad_top + pad_bottom, stride[1] + (pad_left + pad_right) * C };
    // Precomputed mirror offsets for the left/right columns and the bottom rows.
    const int right_offset = (pad_left + original_size[1] - 1) * 2 * C;
    const int left_offset = pad_left * 2 * C;
    const int bottom_offset = 2 * (stride[0] - 1) + pad_top;
    hybrid_loop(padded[0], [&](auto i){
        T* const row = output + i * padded[1];
        // Copy the source row (mirrored for the top and bottom pads) into the centre.
        if (i < padded[0] - pad_bottom)
            std::copy_n(&input[stride[1] * std::abs(i - pad_top)], stride[1], &row[pad_left * C]);
        else
            std::copy_n(&input[stride[1] * (bottom_offset - i)], stride[1], &row[pad_left * C]);
        // Mirror whole C-channel pixels into the left and right pads.
        for (int j = 0; j < pad_left * C; j += C)
            std::copy_n(row + left_offset - j, C, row + j);
        for (int j = padded[1] - pad_right * C; j < padded[1]; j += C)
            std::copy_n(row + right_offset - j, C, row + j);
    });
}
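/* Usage sketch (illustrative): pad a single-channel 4x5 float image by 2
   pixels on every side with reflect-101 borders. Names are assumptions.

   const int size[2] = { 4, 5 };                           // rows, cols
   std::vector<float> src(4 * 5), dst((4 + 4) * (5 + 4));  // padded: 8 x 9
   Reflect_101<float, 1>(src.data(), dst.data(), 2, 2, 2, 2, size);
*/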