-
Notifications
You must be signed in to change notification settings - Fork 3
/
blazingio.hpp
1427 lines (1334 loc) · 51 KB
/
blazingio.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <array>
!ifdef BITSET
#include <bitset>
!endif
!ifdef COMPLEX
#include <complex>
!endif
#include <cstring>
@include
@case *-x86+avx2,*-x86+sse4.1 <immintrin.h>
@case *-aarch64+neon <arm_neon.h>
@case *-x86+none,*-aarch64+none none
@end
@ondemand windows-*
#include <stdint.h>
@end
@include
@case linux-*,macos-* <sys/mman.h>
@case windows-* <windows.h>
@end
!ifdef INTERACTIVE
#include <sys/stat.h>
!endif
@include
@case linux-*,macos-* <unistd.h>
@case windows-* <io.h>
@end
@include
@case linux-* <sys/resource.h>
@case windows-* none
@end
!define UNSET_SIMD
@ondemand *-x86+avx2,*-x86+sse4.1
!undef UNSET_SIMD
!define UNSET_SIMD #define SIMD
@end
@ondemand windows-*
#if _MSC_VER
#define __builtin_add_overflow(a, b, c) _addcarry_u64(0, a, b, c)
UNSET_SIMD
#else
@end
@match
@case *-x86_64,*-aarch64
uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* high) {
auto x = (__uint128_t)a * b;
*high = uint64_t(x >> 64);
return (uint64_t)x;
}
@case *-i386
@end
@define SIMD
@case *-x86+avx2 __attribute__((target("avx2")))
@case *-x86+sse4.1 __attribute__((target("sse4.1")))
@case *-x86+none,*-aarch64
@end
@ondemand windows-*
#endif
@end
@define SIMD_SIZE
@case *-x86+avx2 32
@case *-x86+sse4.1,*-aarch64+neon 16
@case *-x86+none,*-aarch64+none 8
@end
@define #SIMD_TYPE
@case *-x86+avx2 __m256i
@case *-x86+sse4.1 __m128i
@case *-aarch64+neon uint8x16_t
@case *-x86+none,*-aarch64+none uint64_t
@end
// Needs to be unsigned for SWAR code
@define !SIMD_TYPE_OVER_EIGHT
@case *-x86+avx2 int
@case *-x86+sse4.1 short
@case *-aarch64+neon short
@case *-x86+none,*-aarch64+none uint8_t
@end
// This is ridiculous but necessary for clang codegen to be at least somewhat reasonable --
// otherwise it resorts to way too many memory accesses. XXX: is this necessary on MSVC?
// MinGW eats up __forceinline just fine
@define INLINE
@case linux-*,macos-* __attribute__((always_inline))
@case windows-* __forceinline
@end
!ifdef INTERACTIVE
#define FETCH fetch(),
!else
!define FETCH
!endif
#define ensure(x) if (!(x)) abort();
@match
@case linux-*,macos-*
@case windows-*
LONG WINAPI vectored_exception_handler(_EXCEPTION_POINTERS*);
@end
// Using unary minus on unsigned numbers triggers an MSVC warning. The fix is rather simple, but
// I'd rather it was isolated so that it could be rolled back if MSVC zero-warning support is
// dropped.
!define NEGATE_MAYBE_UNSIGNED(x) -x
@ondemand windows-*
!undef NEGATE_MAYBE_UNSIGNED
!define NEGATE_MAYBE_UNSIGNED(x) 1 + ~x
@end
namespace blazingio {
using namespace std;
struct NonAliasingChar {
enum class Inner : char {} c;
NonAliasingChar& operator=(char x) {
c = Inner{x};
return *this;
}
operator char() {
return (char)c;
}
};
constexpr uint64_t ONE_BYTES = ~0ULL / 255
!ifdef BITSET
@ondemand *-x86+none,*-aarch64+none
, BITSET_SHIFT = 0x8040201008040201
@end
!endif
;
// Actually 0x0102040810204080
!define POWERS_OF_TWO ~2ULL / 254
struct line_t {
string& value;
};
!ifndef INTERACTIVE
!define istream_impl blazingio_istream
!endif
!ifdef INTERACTIVE
// Allocate one more byte for null terminator as used by parsing routines. We might want to
// lookahead over this byte though, so add 32 instead of 1.
static NonAliasingChar buffer[65568];
template<int Interactive>
!endif
struct istream_impl {
NonAliasingChar *end, *ptr;
!ifdef INTERACTIVE
void init_assume_file(off_t file_size) {
!else
blazingio_istream() {
off_t file_size =
// Windows works just fine with lseek, but MSVC throws a warning we'd like to avoid.
// interactive=n is a rare case, so why not?
@match
@case linux-*,macos-*
lseek
@case windows-*
_lseek
@end
(STDIN_FILENO, 0, SEEK_END);
ensure(~file_size)
!endif
@match
@case windows-*
// Windows is a mess. With allocation granularity 64k and page size 4k we don't always have
// the option of mapping a zero page immediately after contents. For instance, mapping a 32k
// file will leave the second half of the 64k section unmapped, triggering a page fault upon
// access without letting us to map the page right. Therefore, we map the file and one more
// page; this both gives us free space to work with and guarantees at most 64k bytes trap.
// Aligning the sizes to 64k, we then remap the last 64k with rw memory and read it from
// file. This is a mix of mmap-based file handling and read-based file handling and is
// hopefully more efficient than a pure read-based method.
// Find free space
char* base = (char*)VirtualAlloc(NULL, (file_size + 8191) & -4096, MEM_RESERVE, PAGE_NOACCESS);
ensure(base)
ensure(VirtualFree(base, 0, MEM_RELEASE))
// Map the file there
DWORD mmaped_region_size = file_size & -65536;
ensure(
// If we remove this if and always call CreateFileMapping, it's going to interpret 0 as
// "max", which we don't want.
!mmaped_region_size
|| MapViewOfFileEx(
CreateFileMapping(
GetStdHandle(STD_INPUT_HANDLE),
NULL,
PAGE_READONLY,
// XXX: This assumes the file fits in ~4 GB by putting the size in the low
// DWORD only
0,
mmaped_region_size,
NULL
),
FILE_MAP_READ,
0,
0,
0,
base
) == base
)
// Read into the start of a 64k region
ensure(
VirtualAlloc(base + mmaped_region_size, 65536, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE)
== base + mmaped_region_size
)
ensure(~_lseek(STDIN_FILENO, mmaped_region_size, SEEK_SET))
DWORD tmp_n_read = 0;
ReadFile(GetStdHandle(STD_INPUT_HANDLE), base + mmaped_region_size, 65536, &tmp_n_read, NULL);
@case linux-*,macos-*
// We expect a zero byte soon after EOF. On Linux, man mmap(2) says:
// A file is mapped in multiples of the page size. For a file that is not a multiple of
// the page size, the remaining bytes in the partial page at the end of the mapping are
// zeroed when mapped, and modifications to that region are not written out to the file.
// This is not the whole truth: if the file has previously been mapped with MAP_SHARED,
// modifications to the few bytes after EOF are saved in the shared page and visible even to
// processes that map the file with MAP_PRIVATE. Therefore assuming the rest of the last
// page is zero-filled is unreliable. To prevent that, we mmap the file read-write and
// explicitly zero the byte after EOF.
// Various functions assume at least a few bytes after EOF are readable. For instance,
// that's what vectorized implementations expect. Round that up to page size for simplicity
// because we're going to map an anonymous page anyway. This also enables bitset to work
// more efficiently for bitsets of size up to 4095 on all architectures.
int page_size = getpagesize();
char* base = (char*)mmap(NULL, file_size + page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, STDIN_FILENO, 0);
ensure(base != MAP_FAILED)
// Remap the last page from anonymous mapping to avoid SIGBUS
ensure(mmap(base + ((file_size + page_size - 1) & -page_size), page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) != MAP_FAILED)
@end
end = (NonAliasingChar*)base + file_size;
// Handle attempts to read beyond EOF of stdin in input(). The right thing to do is to stop
// the loop by encountering a space character. \n is chosen instead of \0 so that getline
// can detect EOL by scanning for \n and \r\n without caring about \0.
*end = '\n';
!ifdef STDIN_EOF
// Handle attempts to read beyond EOF of stdin after it was reached. We stop the loop in
// skip_whitespace() by encountering a non-space character, and then stop the input() called
// after that by a space character. "0\0" works just fine and doesn't lead to UB for any
// type.
end[1] = '0';
end[2] = 0;
!endif
ptr = (NonAliasingChar*)base;
}
!ifdef INTERACTIVE
void init_assume_interactive() {
end = ptr = buffer;
}
!endif
!ifndef INTERACTIVE
// For people writing cin.tie(0);
istream_impl* tie(nullptr_t) {
return this;
}
// For people writing cin.tie(0)->sync_with_stdio(0);
void sync_with_stdio(bool) {}
!endif
!ifdef INTERACTIVE
INLINE void fetch() {
if (Interactive && ptr == end) {
!ifdef HOIST_GLOBALS_ON_INTERACTIVE_INPUT
// There's a bit of ridiculous code with questionable choices below. What we *want* is:
// off_t n_read = read(STDIN_FILENO, buffer, 65536);
// Unfortunately, read() is an external call, which means it can override globals. Even
// though blazingio_cin is static, there's no guarantee read() doesn't map to a symbol
// from the same translation unit, so GCC assumes read() may read or modify ptr or end.
// This causes it to spill the values to memory in the hot loop, which is a bad-bad
// thing. Therefore we have to avoid the call to read() and roll our own inline
// assembly. The *obvious* way to write that is
// off_t n_read = SYS_read;
// asm volatile(
// "syscall"
// : "+a"(n_read)
// : "D"(STDIN_FILENO), "S"(buffer), "d"(65536)
// : "rcx", "r11", "memory"
// );
// ...but that suffers from the same consequences as the call to read() because of the
// memory clobber. The second-most obvious way to rewrite this is documented by GCC as
// off_t n_read = SYS_read;
// asm volatile(
// "syscall"
// : "+a"(n_read), "+m"(buffer)
// : "D"(STDIN_FILENO), "S"(buffer), "d"(65536)
// : "rcx", "r11"
// );
// ...which should theoretically be optimal because 'buffer' is of type
// NonAliasingChar[], meaning it can't alias with ptr and other fields. Unfortunately,
// that doesn't work *either* because of a missed optimization that hasn't been fixed
// for years (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63900), so we sort of
// have to resort to UB and compiler-specific optimizations here. We thus remove any
// mention of clobbering anything NonAliasingChar-related and simulate the behavior "as
// if" the buffer was *reallocated* on each read by telling GCC that the syscall might
// change 'ptr' (it actually won't, but GCC won't be able to misoptimize based on this):
// ptr = buffer;
// off_t n_read = SYS_read;
// asm volatile(
// "syscall"
// : "+a"(n_read), "+S"(ptr)
// : "D"(STDIN_FILENO), "d"(65536)
// : "rcx", "r11"
// );
// UNFORTUNATELY, this doesn't work *either* due to *yet another* missed optimization:
// even though ptr is clearly loaded into a register, GCC assumes memory might still be
// modified, so we have to load ptr into a local variable and then put it back, like
// this:
@define !SYSCALL_NO_REGISTER
@case linux-* "x8"
@case macos-* "x16"
@end
@define !SVC
@case linux-*
@case macos-* "x80"
@end
@define !INSN
@case *-x86_64 "syscall"
@case *-i386 "int $128"
@end
@define !ARGS
@case *-x86_64 "+S"(arg1) : "D"
@case *-i386 "+c"(arg1) : "b"
@end
@define !CLOBBERS
@case *-x86_64 UNWRAP(: "rcx", "r11")
@case *-i386
@end
@define !X86_SYS_read
@case linux-i386 3
@case linux-x86_64 0
@case macos-x86_64 0x2000003 /* This is not documented anywhere, but it's been this for dozens of years */
@end
@define !AARCH64_SYS_read
@case linux-aarch64 63
@case macos-aarch64 3
@end
!define UNIX_READ \
@match
@case *-x86 \
off_t n_read = X86_SYS_read; \
NonAliasingChar* arg1 = buffer; \
asm volatile( \
/* XXX: Handling errors here is complicated, because on Linux syscall will return
a small negative number in rax, leading to OOB write, while on XNU the syscall
will return a small positive number in rax and set a carry flag we ignore, making
it seem like we've just read a few bytes. Neither case is handled correctly, and
'ensure(n_read >= 0)' just hides the error, so let us explicitly state we don't
support errors returned from read(2) for now. This should probably be fixed
later. */ \
INSN \
: "+a"(n_read), ARGS(STDIN_FILENO), "d"(65536) \
CLOBBERS \
); \
ptr = arg1; \
@case *-aarch64 wrap
/* Linux: svc 0, syscall number in x8
Mac OS: svc 0x80, syscall number in x16 */ \
register long \
n_read asm("x0") = STDIN_FILENO, \
arg1 asm("x1") = (long)buffer, \
arg2 asm("x2") = 65536, \
syscall_no asm(SYSCALL_NO_REGISTER) = AARCH64_SYS_read; \
asm volatile( \
"svc 0" SVC \
: "+r"(n_read), "+r"(arg1) \
: "r"(syscall_no), "r"(arg2) \
); \
/* On XNU, x1 is overridden after syscall, so we can't rely on arg1 like
we did in x86 case. */ \
ptr = launder(buffer); \
@end
!else
!define UNIX_READ off_t n_read = read(STDIN_FILENO, ptr = buffer, 65536); ensure(~n_read)
!endif
@match
@case linux-*,macos-*
UNIX_READ
@case windows-*
DWORD n_read = 0;
ReadFile(GetStdHandle(STD_INPUT_HANDLE), ptr = buffer, 65536, &n_read, NULL);
@end
end = ptr + n_read;
// Reading strings is more efficient if we don't have to check that ptr < end on each
// iteration. Indeed, if we put whitespace after each chunk of data, string procedures
// can only check if ptr == end (and thus the next chunk has to be loaded) after the
// last iteration. We also need '\n' at EOF (i.e. after the last chunk) for getline(),
// and this kills two birds with one stone.
*end = '\n';
!ifdef STDIN_EOF
if (!n_read)
// Handle attempts to read beyond EOF of stdin after it was reached, just like with
// files. Be careful to use 'buffer' instead of 'ptr' here -- using the latter
// confuses GCC's optimizer for some reason.
buffer[1] = '0',
buffer[2] = 0;
!endif
}
}
!endif
template<typename T>
INLINE void collect_digits(T& x) {
while (FETCH (*ptr & 0xf0) == 0x30)
x = T(x * 10 + (*ptr++ - '0'));
}
template<typename T>
INLINE decltype((void)~T{1}) input(T& x) {
!ifdef INTERACTIVE
fetch();
!endif
int negative = is_signed_v<T> && *ptr == '-';
ptr += negative;
collect_digits(x = 0);
x = negative ? NEGATE_MAYBE_UNSIGNED(x) : x;
}
!ifdef FLOAT
template<typename T>
INLINE decltype((void)T{1.}) input(T& x) {
!ifdef INTERACTIVE
fetch();
!endif
int negative = *ptr == '-';
ptr += negative;
FETCH ptr += *ptr == '+';
uint64_t n = 0;
int i = 0;
for (; i < 18 && (FETCH *ptr & 0xf0) == 0x30; i++)
n = n * 10 + *ptr++ - '0';
int exponent = 20; // Offset by 20, for reasons
int has_dot = *ptr == '.';
ptr += has_dot;
for (; i < 18 && (FETCH *ptr & 0xf0) == 0x30; i++)
n = n * 10 + *ptr++ - '0',
exponent -= has_dot;
x = (T)n;
while ((FETCH *ptr & 0xf0) == 0x30)
x = x * 10 + *ptr++ - '0',
exponent -= has_dot;
if (*ptr == '.')
ptr++,
has_dot = true;
while ((FETCH *ptr & 0xf0) == 0x30)
x = x * 10 + *ptr++ - '0',
exponent -= has_dot;
int new_exponent;
if ((*ptr | 0x20) == 'e')
ptr++,
FETCH ptr += *ptr == '+',
input(new_exponent),
exponent += new_exponent;
// This generates {1e-20, 1e-14, ..., 1e14, 1e20}
static constexpr auto exps = []() {
array<T, 41> exps{};
T x = 1;
for (int i = 21; i--; )
exps[40 - i] = x,
exps[i] = 1 / x,
x *= 10;
return exps;
}();
while (exponent > 40)
x *= (T)1e10,
exponent -= 10;
while (exponent < 0)
x *= (T)1e-10,
exponent += 10;
x *= exps[exponent];
x = negative ? -x : x;
}
!endif
INLINE void input(bool& x) {
FETCH x = *ptr++ == '1';
}
INLINE void input(char& x) {
FETCH x = *ptr++;
}
!ifdef CHAR_WITH_SIGN_IS_GLYPH
INLINE void input(uint8_t& x) {
FETCH x = *ptr++;
}
INLINE void input(int8_t& x) {
FETCH x = *ptr++;
}
!endif
template<typename T>
SIMD void input_string_like(string& value, T trace) {
!ifdef INTERACTIVE
fetch();
!endif
NonAliasingChar* start = ptr;
trace();
// We know that [start; ptr) does not overlap 'value'. std::string::assign doesn't know that
// and will perform a runtime check to determine if it need to handle aliasing strings
// gracefully. This takes a bit of time, so we *used to* do the following instead:
// struct UninitChar { UninitChar& operator=(UninitChar) { return *this; } };
// ((basic_string<UninitChar>&)value).resize(ptr - start);
// memcpy(value.data(), start, ptr - start);
// This worked just fine, but libc++ forbids this code because UninitChar is not a trivial
// type. Therefore, disable this optimization.
value.assign((char*)start, ptr - start);
!ifdef INTERACTIVE
while (Interactive && ptr == end && (FETCH end != buffer)) {
// We have read *some* data, but stumbled upon an unfetched chunk and thus have to load
// more. We can't reuse the same code as we want to append to the string instead of
// replacing it.
// Abuse the fact that ptr points at buffer after a non-trivial fetch to avoid storing
// start.
trace();
value.append(buffer, ptr);
}
!endif
}
SIMD void input(string& value) {
input_string_like(value, [&]() SIMD {
// We expect long runs here, hence vectorization. Instrinsics break aliasing, and if we
// interleave ptr modification with SIMD loading, there's going to be an extra memory
// write on every iteration.
NonAliasingChar* p = ptr;
@match
@case linux-*,macos-*
@case windows-*
ULONG index;
@end
@define !BSFD(x)
@case linux-*,macos-* __builtin_ctz(x)
@case windows-* (_BitScanForward(&index, x), index)
@end
@define !BSFQ_64BIT(x)
@case linux-*,macos-* __builtin_ctzll(x)
@case windows-* (_BitScanForward64(&index, x), index)
@end
@define !BSFQ(x)
@case linux-*,macos-*,windows-x86_64,windows-aarch64 BSFQ_64BIT(x)
@case windows-i386 (_BitScanForward(&index, (ULONG)x) || (_BitScanForward(&index, ULONG(x >> 32)), index += 32), index)
@end
SIMD_TYPE x;
@match
@case *-x86+avx2
int mask;
SIMD_TYPE space = _mm256_set1_epi8(' ');
while (
memcpy(&x, p, 32),
!(mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(space, _mm256_max_epu8(space, x))))
)
p += 32;
ptr = p + BSFD(mask);
@case *-x86+sse4.1
int mask;
SIMD_TYPE space = _mm_set1_epi8(' ');
while (
memcpy(&x, p, 16),
!(mask = _mm_movemask_epi8(_mm_cmpeq_epi8(space, _mm_max_epu8(space, x))))
)
p += 16;
ptr = p + BSFD(mask);
@case *-aarch64+neon
uint64x2_t vec;
while (memcpy(&x, p, 16), vec = uint64x2_t(x < 33), !(vec[0] | vec[1]))
p += 16;
ptr = p + (vec[0] ? 0 : 8) + BSFQ_64BIT(vec[0] ? vec[0] : vec[1]) / 8;
@case *-x86+none,*-aarch64+none
// This is a variation on Mycroft's algorithm. See
// https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ for
// the original code.
uint64_t vec;
while (memcpy(&x, p, 8), !(vec = ((x - ONE_BYTES * 33) & ~x & (ONE_BYTES << 7))))
p += 8;
ptr = p + BSFQ(vec) / 8;
@end
});
}
SIMD void input(line_t& line) {
input_string_like(line.value, [&]() {
ptr = (NonAliasingChar*)memchr(ptr, '\n', end - ptr + 1);
});
if (line.value.size() && line.value.back() == '\r')
line.value.pop_back();
// Skip \n unless the terminator is part of EOF and we have read a non-empty string (so as
// not to trigger EOF after reading a non-terminated line)
if (line.value.empty() || ptr < end)
ptr += *ptr == '\n';
}
!ifdef COMPLEX
template<typename T>
INLINE void input(complex<T>& value) {
T real_part, imag_part{};
if (FETCH *ptr == '(') {
ptr++;
input(real_part);
if (FETCH *ptr++ == ',')
!ifdef INTERACTIVE
rshift_impl(imag_part),
!else
*this >> imag_part,
!endif
ptr++;
} else
input(real_part);
value = {real_part, imag_part};
}
!endif
!ifdef BITSET
template<size_t N>
SIMD void input(bitset<N>& value) {
!ifdef STDIN_EOF
// As we always read N bytes, we might read past the end of the file in case EOF happens.
// Luckily, we are allowed to overread up to 4095 bytes after EOF (because there's a
// 4096-page and its second byte is non-whitespace). Therefore, we only have to check for
// EOF for large enough N, and in this case the overhead is small enough.
if (N > 4095 && !*this)
return;
!endif
ptrdiff_t i = N;
!ifdef INTERACTIVE
while (i)
if (FETCH i % SIMD_SIZE || end - ptr < SIMD_SIZE)
value[--i] = *ptr++ == '1';
else {
!else
while (i % SIMD_SIZE)
value[--i] = *ptr++ == '1';
!endif
NonAliasingChar* p = ptr;
!ifdef INTERACTIVE
for (int64_t j = 0; j < min(i, end - ptr) / SIMD_SIZE; j++) {
!else
while (i) {
!endif
i -= SIMD_SIZE;
@define !BSWAP32
@case linux-*,macos-* __builtin_bswap32
@case windows-* _byteswap_ulong
@end
SIMD_TYPE x;
memcpy(&x, p, SIMD_SIZE);
@match
@case *-x86+avx2
// This is actually 0x0001020304050607
uint64_t a = ~0ULL / 65025;
auto y = BSWAP32(
_mm256_movemask_epi8(
_mm256_shuffle_epi8(
_mm256_slli_epi32(x, 7),
_mm256_set_epi64x(
a + ONE_BYTES * 24,
a + ONE_BYTES * 16,
a + ONE_BYTES * 8,
a
)
)
)
);
@case *-x86+sse4.1
// This is actually 0x0001020304050607
uint64_t a = ~0ULL / 65025;
int y = _mm_movemask_epi8(
_mm_shuffle_epi8(
_mm_slli_epi32(x, 7),
_mm_set_epi64x(a, a + ONE_BYTES * 8)
)
);
@case *-aarch64+neon
auto masked = (uint8x16_t)vdupq_n_u64(POWERS_OF_TWO) & ('0' - x);
auto zipped = vzip_u8(vget_high_u8(masked), vget_low_u8(masked));
auto y = vaddvq_u16(
(uint16x8_t)vcombine_u8(zipped.val[0], zipped.val[1])
);
@case *-x86+none,*-aarch64+none
char y = char((x & ONE_BYTES) * BITSET_SHIFT >> 56);
@end
p += SIMD_SIZE;
memcpy((char*)&value + i / 8, &y, SIMD_SIZE / 8);
}
ptr = p;
!ifdef INTERACTIVE
}
!endif
}
!endif
template<typename T>
!ifdef INTERACTIVE
INLINE void rshift_impl(T& value) {
!else
INLINE blazingio_istream& operator>>(T& value) {
!endif
if (!is_same_v<T, line_t>)
// Skip whitespace. 0..' ' are not all whitespace, but we only care about well-formed
// input. We expect short runs here, hence no vectorization.
while (FETCH (uint8_t)*ptr < 33)
ptr++;
input(value);
!ifndef INTERACTIVE
return *this;
!endif
}
!ifdef STDIN_EOF
operator bool() {
return !!*this;
}
bool operator!() {
return ptr > end;
}
!endif
};
!ifdef INTERACTIVE
struct blazingio_istream {
istream_impl<false> file;
istream_impl<true> interactive;
blazingio_istream() {
// We want to switch to a pipe-based method if the file is a special device. This cannot be
// reliably detected by the return value of lseek(SEEK_END) because the returned value
// depends on the OS:
// - Linux returns -1, with errno EISPIPE.
// - Mac OS returns 0.
// - Windows returns 131072 (wtf), supposedly the size of the buffer.
// Therefore, don't try to be smart about this and just do an honest stat
struct stat stat_buf;
ensure(~fstat(STDIN_FILENO, &stat_buf))
// A real S_ISREG is sometimes unavailable (e.g. under MSVC), so simulate it
(stat_buf.st_mode >> 12) == 8
? file.init_assume_file(stat_buf.st_size)
: interactive.init_assume_interactive();
}
// For people writing cin.tie(0);
blazingio_istream* tie(nullptr_t) {
return this;
}
// For people writing cin.tie(0)->sync_with_stdio(0);
void sync_with_stdio(bool) {}
template<typename T>
INLINE blazingio_istream& operator>>(T& value) {
file.ptr
? file.rshift_impl(value)
: interactive.rshift_impl(value);
return *this;
}
!ifdef STDIN_EOF
operator bool() {
return !!*this;
}
bool operator!() {
return file.ptr ? !file : !interactive;
}
!endif
};
!endif
short decimal_lut[100];
char max_digits_by_log2[64]{1};
struct SPLIT_HERE blazingio_ostream {
char* base;
NonAliasingChar* ptr;
int ever_flushed;
blazingio_ostream() {
// We *could* use 'base = new char[0x20000000];' instead of mmap-based allocation here, but
// that would lead to problems on systems without overcommit, such as Windows.
// The size is limited by a bit greater than 0x20000000 because 32-bit WINE only allows to
// allocate that much.
// ejudge seems to be cursed. It supports RSS limits as opposed to VMA limits, but this
// feature is used neither by ej-polygon nor polygon-to-ejudge, which means we'll likely
// fail to allocate 0.5 GiB of address space for output. We can't even make it *growable*,
// really, because there's no way to reserve an address range. We, however, don't want to
// hinder feature support on sunwalker and Yandex.Contest (it got *something* right for
// once, maybe let's congratulate it with that and offer emotional support). Therefore, only
// reduce allocation size to 24 MiB if a limit is detected.
@define !CHECK_RLIMIT
@case linux-* rlimit rlim; getrlimit(RLIMIT_AS, &rlim); if (~rlim.rlim_cur) alloc_size = 0x1800000;
@case macos-*
@end
@match
@case linux-*,macos-*
// Avoid MAP_SHARED: it turns out it's pretty damn inefficient compared to a write at the
// end. This also allows us to allocate memory immediately without waiting for freopen,
// because we'll only use the fd in the destructor.
size_t alloc_size = 0x20000000;
CHECK_RLIMIT
base = (char*)mmap(
NULL,
alloc_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
-1,
0
);
// ejudge limits virtual address space in some cases. For instance, it does that by default
// on problems imported with ej-polygon.
ensure(base != MAP_FAILED)
@case windows-*
// Windows doesn't support anything like MAP_NORESERVE or overcommit. Therefore, reserve
// memory and use guard pages to extend the committed region.
ensure(base = (char*)VirtualAlloc(NULL, 0x20000000, MEM_RESERVE, PAGE_READWRITE))
ensure(VirtualAlloc(base, 4096, MEM_COMMIT, PAGE_READWRITE | PAGE_GUARD))
AddVectoredExceptionHandler(true, vectored_exception_handler);
@end
ptr = (NonAliasingChar*)base;
// The code gets shorter if we initialize LUT here as opposed to during compile time.
for (int i = 0; i < 100; i++)
decimal_lut[i] = short(('0' + i / 10) | (('0' + i % 10) << 8));
for (int i = 1; i < 64; i++)
max_digits_by_log2[i] = max_digits_by_log2[i - 1] + (0x8922489224892249 >> i & 1);
}
~blazingio_ostream() {
!ifdef INTERACTIVE
flush(
@match
@case linux-*,macos-*
@case windows-*
!ever_flushed
@end
);
}
void flush(
@match
@case linux-*,macos-*
@case windows-*
int attempt_direct_write = false
@end
) {
!endif
@match
@case linux-*,macos-*
auto start = base;
ssize_t n_written;
while ((n_written = write(STDOUT_FILENO, start, (char*)ptr - start)) > 0)
start += n_written;
ensure(~n_written)
@case windows-*
ever_flushed = true;
auto stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
!define WRAP_REOPEN(x) x
!ifdef INTERACTIVE
!undef WRAP_REOPEN
!define WRAP_REOPEN(x) attempt_direct_write ? x : INVALID_HANDLE_VALUE
!endif
auto handle = WRAP_REOPEN(ReOpenFile(
stdout_handle,
GENERIC_WRITE,
// Be as general as possible
FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH
));
DWORD n_written;
ensure(
handle == INVALID_HANDLE_VALUE
? WriteFile(stdout_handle, base, DWORD((char*)ptr - base), &n_written, NULL)
: (
WriteFile(handle, base, DWORD(((char*)ptr - base + 4095) & -4096), &n_written, NULL)
&& ~_chsize(1, int((char*)ptr - base))
)
)
@end
!ifdef INTERACTIVE
ptr = (NonAliasingChar*)base;
!endif
}
!ifndef INTERACTIVE
void flush() {}
!endif
void print(char value) {
*ptr++ = value;
}
!ifdef CHAR_WITH_SIGN_IS_GLYPH
void print(uint8_t value) {
*ptr++ = value;
}
void print(int8_t value) {
*ptr++ = value;
}
!endif
void print(bool value) {
*ptr++ = '0' + value;
}
template<typename T>
decltype((void)~T{1}) print(T value) {
using AbsT = make_unsigned_t<T>;
AbsT abs = value;
if (value < 0)
print('-'),
abs = NEGATE_MAYBE_UNSIGNED(abs);
!ifndef CHAR_WITH_SIGN_IS_GLYPH
if constexpr (sizeof(T) == 1) {
int digits = 1 + (abs > 9) + (abs > 99);
NonAliasingChar buf[6];
memcpy(buf, decimal_lut + abs / 10, 2);
buf[2] = '0' + char(abs % 10);
memcpy(ptr, buf + 3 - digits, 4);
ptr += digits;
return;
}
!endif
static constexpr auto powers_of_ten = []() {
array<AbsT, 5 * sizeof(T) / 2> powers_of_ten{};
AbsT n = 1;
for (size_t i = 1; i < powers_of_ten.size(); i++)
n *= 10,
powers_of_ten[i] = n;
return powers_of_ten;
}();
// We somehow need to skip leading zeroes. Do that by computing decimal length separately.
@match
@case linux-*,macos-*
@case windows-*
ULONG ilog2;
@end
int digits = max_digits_by_log2[
@define !BSRQ_WINDOWS
@case *-x86_64,*-aarch64 _BitScanReverse64(&ilog2, abs | 1)
@case *-i386 _BitScanReverse(&ilog2, ULONG((int64_t)abs >> 32)) ? ilog2 += 32 : _BitScanReverse(&ilog2, (ULONG)abs | 1)
@end
@match
@case windows-*
(BSRQ_WINDOWS, ilog2)
@case linux-*,macos-*
// This compiles to a single instruction on x64. |1 is to handle abs == 0 gracefully.
63 ^ __builtin_clzll(abs | 1)
@end
];
digits -= abs < powers_of_ten[digits - 1];
// This is a variation on Terje Mathisen's algorithm. See
// http://computer-programming-forum.com/46-asm/7aa4b50bce8dd985.htm
short buf[20];
if constexpr (sizeof(T) == 2) {
// We use a 32-bit fixed-point format here. The high 7 bits are the whole part and the
// low 25 low bits are the real part. 7 bits are used because that's the shortest amount
// of bits 99 fits in.
// abs / 1e3 in fixed point. 2^25 / 1e3 is actually 33554.432, but rounding it up to
// 33555 would introduce too big an error. We compensate for it by effectively using
// 33554.5 as the factor. The computed value, when multiplied back by 1e3, will have the
// whole part equal to (33554.5e3 * abs) >> 25. We want this to be less than 1 far from
// abs, i.e.
// ((33554.5e3 * abs) >> 25) - abs < 1,
// or
// (33554.5e3 - 2^25) * abs < 2^25.
// Luckily, this is true from all abs up to 2^16.
// The computation is a bit off for odd abs: in this case n is 1/2 larger than the
// theoretical value, which is a ridiculously small error, so the check still passes.
auto n = 33555U * abs - abs / 2;
uint64_t buf = decimal_lut[n >> 25];
n = (n & 0x01ffffff) * 25;
buf |= decimal_lut[n >> 23] << 16;
buf |= uint64_t('0' + ((n & 0x007fffff) * 5 >> 22)) << 32;
buf >>= 40 - digits * 8;
memcpy(ptr, &buf, 8);
} else if constexpr (sizeof(T) == 4) {
// We use a 64-bit fixed-point format here. The high 7 bits are the whole part and the
// low 57 low bits are the rneal part. 7 bits are used because that's the shortest
// amount of bits 99 fits in.
// abs / 1e8 in fixed point. 2^57 / 1e8 is actually 1441151880.7585588..., so we round
// it up. This introduces an error. The computed value, when multiplied back by 1e8,