// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_CPU_CACHE_H_
#define TCMALLOC_CPU_CACHE_H_
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>
#include <algorithm>
#include <atomic>
#include <new>
#include <tuple>
#include <type_traits>
#include <utility>
#include "absl/algorithm/container.h"
#include "absl/base/attributes.h"
#include "absl/base/call_once.h"
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/optimization.h"
#include "absl/base/thread_annotations.h"
#include "absl/container/fixed_array.h"
#include "absl/functional/function_ref.h"
#include "absl/time/time.h"
#include "absl/types/span.h"
#include "tcmalloc/common.h"
#include "tcmalloc/experiment.h"
#include "tcmalloc/experiment_config.h"
#include "tcmalloc/internal/allocation_guard.h"
#include "tcmalloc/internal/config.h"
#include "tcmalloc/internal/cpu_utils.h"
#include "tcmalloc/internal/environment.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/numa.h"
#include "tcmalloc/internal/optimization.h"
#include "tcmalloc/internal/percpu.h"
#include "tcmalloc/internal/percpu_tcmalloc.h"
#include "tcmalloc/internal/sysinfo.h"
#include "tcmalloc/pages.h"
#include "tcmalloc/parameters.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/thread_cache.h"
#include "tcmalloc/transfer_cache.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
class CpuCachePeer;
namespace cpu_cache_internal {
template <class CpuCache>
struct DrainHandler;
// Determine number of bits we should use for allocating per-cpu cache.
// The amount of per-cpu cache is 2 ^ per-cpu-shift.
// When dynamic slab size is enabled, we start with kInitialPerCpuShift and
// grow as needed up to kMaxPerCpuShift. When dynamic slab size is disabled,
// we always use kMaxPerCpuShift.
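// For example, with the default (non-TCMALLOC_INTERNAL_SMALL_BUT_SLOW) shifts
// below, slabs range from 2^14 = 16 KiB up to 2^18 = 256 KiB per CPU, before
// the NUMA and wider-slab adjustments applied in Activate().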
#if defined(TCMALLOC_INTERNAL_SMALL_BUT_SLOW)
constexpr inline uint8_t kInitialBasePerCpuShift = 12;
constexpr inline uint8_t kMaxBasePerCpuShift = 12;
#else
constexpr inline uint8_t kInitialBasePerCpuShift = 14;
constexpr inline uint8_t kMaxBasePerCpuShift = 18;
#endif
constexpr inline uint8_t kNumPossiblePerCpuShifts =
kMaxBasePerCpuShift - kInitialBasePerCpuShift + 1;
constexpr inline uint8_t kResizeSlabCopies = 2;
constexpr inline uint8_t kTotalPossibleSlabs =
kNumPossiblePerCpuShifts * kResizeSlabCopies;
// StaticForwarder provides access to the SizeMap and transfer caches.
//
// This is a class, rather than namespaced globals, so that it can be mocked for
// testing.
class StaticForwarder {
public:
[[nodiscard]] static void* Alloc(size_t size, std::align_val_t alignment)
ABSL_LOCKS_EXCLUDED(pageheap_lock) {
TC_ASSERT(tc_globals.IsInited());
PageHeapSpinLockHolder l;
return tc_globals.arena().Alloc(size, alignment);
}
[[nodiscard]] static void* AllocReportedImpending(size_t size,
std::align_val_t alignment)
ABSL_LOCKS_EXCLUDED(pageheap_lock) {
TC_ASSERT(tc_globals.IsInited());
PageHeapSpinLockHolder l;
// Negate previous update to allocated that accounted for this allocation.
tc_globals.arena().UpdateAllocatedAndNonresident(
-static_cast<int64_t>(size), 0);
return tc_globals.arena().Alloc(size, alignment);
}
static void Dealloc(void* ptr, size_t size, std::align_val_t alignment) {
TC_ASSERT(false);
}
static void ArenaUpdateAllocatedAndNonresident(int64_t allocated,
int64_t nonresident)
ABSL_LOCKS_EXCLUDED(pageheap_lock) {
TC_ASSERT(tc_globals.IsInited());
PageHeapSpinLockHolder l;
if (allocated > 0) {
tc_globals.page_allocator().ShrinkToUsageLimit(Length(allocated));
}
tc_globals.arena().UpdateAllocatedAndNonresident(allocated, nonresident);
}
static bool per_cpu_caches_dynamic_slab_enabled() {
return Parameters::per_cpu_caches_dynamic_slab_enabled();
}
static double per_cpu_caches_dynamic_slab_grow_threshold() {
return Parameters::per_cpu_caches_dynamic_slab_grow_threshold();
}
static double per_cpu_caches_dynamic_slab_shrink_threshold() {
return Parameters::per_cpu_caches_dynamic_slab_shrink_threshold();
}
static size_t class_to_size(int size_class) {
return tc_globals.sizemap().class_to_size(size_class);
}
static absl::Span<const size_t> cold_size_classes() {
return tc_globals.sizemap().ColdSizeClasses();
}
static size_t num_objects_to_move(int size_class) {
return tc_globals.sizemap().num_objects_to_move(size_class);
}
static const NumaTopology<kNumaPartitions, kNumBaseClasses>& numa_topology() {
return tc_globals.numa_topology();
}
static ShardedTransferCacheManager& sharded_transfer_cache() {
return tc_globals.sharded_transfer_cache();
}
static TransferCacheManager& transfer_cache() {
return tc_globals.transfer_cache();
}
static bool UseGenericShardedCache() {
return tc_globals.sharded_transfer_cache().UseGenericCache();
}
static bool UseShardedCacheForLargeClassesOnly() {
return tc_globals.sharded_transfer_cache().UseCacheForLargeClassesOnly();
}
// We use size class maximum capacities as configured in sizemap.
//
// TODO(b/311398687): re-enable this experiment.
static bool ConfigureSizeClassMaxCapacity() { return false; }
};
template <typename NumaTopology>
uint8_t NumaShift(const NumaTopology& topology) {
return topology.numa_aware()
? absl::bit_ceil(topology.active_partitions() - 1)
: 0;
}
// Translates from a shift value to the offset of that shift in arrays of
// possible shift values.
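// For example, ShiftOffset(/*shift=*/16, /*initial_shift=*/14) == 2.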
inline uint8_t ShiftOffset(uint8_t shift, uint8_t initial_shift) {
TC_ASSERT_GE(shift, initial_shift);
return shift - initial_shift;
}
// Tracks the range of allowed slab shifts.
struct SlabShiftBounds {
uint8_t initial_shift;
uint8_t max_shift;
};
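// Returns the effective maximum capacity of a size class for a given slab
// shift: capacities are configured for max_shift and scaled down for smaller
// slabs. For example, a class configured for 2048 objects at max_shift gets
// (2048 >> 1) - 3 = 1021 objects when the slab is one shift smaller.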
struct GetShiftMaxCapacity {
size_t operator()(size_t size_class) const {
TC_ASSERT_GE(shift_bounds.max_shift, shift);
const uint8_t relative_shift = shift_bounds.max_shift - shift;
if (relative_shift == 0)
return max_capacities[size_class].load(std::memory_order_relaxed);
int mc = max_capacities[size_class].load(std::memory_order_relaxed) >>
relative_shift;
// We decrement by 3 because of (1) cost of per-size-class header, (2) cost
// of per-size-class padding pointer, (3) there are a lot of empty size
// classes that have headers and whose max capacities can't be decremented.
// TODO(b/272085443): try using size_class_to_header_idx array to allow for
// not having headers for empty size classes.
// TODO(b/219565872): try not doing prefetching for large size classes to
// allow for not having padding pointers for large size classes.
mc = std::max(mc - 3, 0);
return mc;
}
const std::atomic<uint16_t>* max_capacities;
uint8_t shift;
SlabShiftBounds shift_bounds;
};
template <typename Forwarder>
class CpuCache {
public:
struct CpuCacheMissStats {
size_t underflows = 0;
size_t overflows = 0;
CpuCacheMissStats& operator+=(const CpuCacheMissStats rhs) {
underflows += rhs.underflows;
overflows += rhs.overflows;
return *this;
}
};
enum class DynamicSlabResize {
kNoop = 0,
kShrink,
kGrow,
};
enum class PerClassMissType {
// Tracks total number of capacity misses.
kCapacityTotal = 0,
// Tracks number of misses recorded as of the end of the last per-class
// resize interval.
kCapacityResize,
// Tracks total number of misses due to insufficient max_capacity.
kMaxCapacityTotal,
// Tracks number of misses recorded as of the end of the last per-class
// max capacity resize interval.
kMaxCapacityResize,
kNumTypes,
};
// We track the number of overflows/underflows for each of these cases.
enum class MissCount {
// Tracks total number of misses.
kTotal = 0,
// Tracks number of misses recorded as of the end of the last shuffle
// interval.
kShuffle,
// Tracks number of misses recorded as of the end of the last resize
// interval.
kReclaim,
// Tracks number of misses recorded as of the end of the last slab resize
// interval.
kSlabResize,
kNumCounts,
};
struct SizeClassCapacityStats {
size_t min_capacity = 0;
double avg_capacity = 0;
size_t max_capacity = 0;
size_t max_capacity_misses = 0;
absl::Duration min_last_underflow = absl::InfiniteDuration();
absl::Duration max_last_underflow;
absl::Duration min_last_overflow = absl::InfiniteDuration();
absl::Duration max_last_overflow;
int min_last_underflow_cpu_id = -1;
int max_last_underflow_cpu_id = -1;
int min_last_overflow_cpu_id = -1;
int max_last_overflow_cpu_id = -1;
};
struct DynamicSlabInfo {
std::atomic<size_t> grow_count[kNumPossiblePerCpuShifts];
std::atomic<size_t> shrink_count[kNumPossiblePerCpuShifts];
std::atomic<size_t> madvise_failed_bytes;
};
// Sets the lower limit on the capacity that can be stolen from the cpu cache.
static constexpr double kCacheCapacityThreshold = 0.20;
constexpr CpuCache() = default;
// tcmalloc explicitly initializes its global state (to be safe for
// use in global constructors) so our constructor must be trivial;
// do all initialization here instead.
void Activate();
// For testing
void Deactivate();
// Allocate an object of the given size class.
// Returns nullptr when allocation fails.
[[nodiscard]] void* Allocate(size_t size_class);
// Separate allocation fast/slow paths.
// The fast path succeeds iff the thread has already cached the slab pointer
// (done by AllocateSlow) and there is an available object in the slab.
[[nodiscard]] void* AllocateFast(size_t size_class);
[[nodiscard]] void* AllocateSlow(size_t size_class);
// A slightly faster version of AllocateSlow that may be called only
// when it's known that no hooks are installed.
[[nodiscard]] void* AllocateSlowNoHooks(size_t size_class);
// Free an object of the given class.
void Deallocate(void* ptr, size_t size_class);
// Separate deallocation fast/slow paths.
// The fast path succeeds iff the thread has already cached the slab pointer
// (done by DeallocateSlow) and there is free space in the slab.
bool DeallocateFast(void* ptr, size_t size_class);
void DeallocateSlow(void* ptr, size_t size_class);
// A slightly faster version of DeallocateSlow that may be called only
// when it's known that no hooks are installed.
void DeallocateSlowNoHooks(void* ptr, size_t size_class);
// Force all Allocate/DeallocateFast to fail in the current thread
// if malloc hooks are installed.
void MaybeForceSlowPath();
// Give the number of bytes in <cpu>'s cache
uint64_t UsedBytes(int cpu) const;
// Give the allocated number of bytes in <cpu>'s cache
uint64_t Allocated(int cpu) const;
// Whether <cpu>'s cache has ever been populated with objects
bool HasPopulated(int cpu) const;
PerCPUMetadataState MetadataMemoryUsage() const;
// Give the number of bytes used in all cpu caches.
uint64_t TotalUsedBytes() const;
// Give the number of objects of a given class in all cpu caches.
uint64_t TotalObjectsOfClass(size_t size_class) const;
// Give the number of bytes unallocated to any sizeclass in <cpu>'s cache.
uint64_t Unallocated(int cpu) const;
// Gives the total capacity of <cpu>'s cache in bytes.
//
// The total capacity of <cpu>'s cache should be equal to the sum of allocated
// and unallocated bytes for that cache.
uint64_t Capacity(int cpu) const;
// Give the per-cpu limit of cache size.
uint64_t CacheLimit() const;
void SetCacheLimit(uint64_t v);
// Shuffles per-cpu caches using the number of underflows and overflows that
// occurred in the prior interval. It selects the top per-cpu caches
// with highest misses as candidates, iterates through the other per-cpu
// caches to steal capacity from them and adds the stolen bytes to the
// available capacity of the per-cpu caches. May be called from any processor.
//
// TODO(vgogte): There are quite a few knobs that we can play around with in
// ShuffleCpuCaches.
void ShuffleCpuCaches();
// Tries to reclaim inactive per-CPU caches. It iterates through the set of
// populated cpu caches and reclaims the caches that:
// (1) had same number of used bytes since the last interval,
// (2) had no change in the number of misses since the last interval.
void TryReclaimingCaches();
// Resize size classes for up to kNumCpuCachesToResize cpu caches per
// interval.
static constexpr int kNumCpuCachesToResize = 10;
// Resizes size classes within up to kNumCpuCachesToResize per-cpu caches per
// iteration in a round-robin fashion. For each such cpu cache, it iterates
// through the size classes and attempts to grow up to kMaxSizeClassesToResize
// of them by stealing capacity from the rest of the size classes.
void ResizeSizeClasses();
// Gets the max capacity for the size class using the current per-cpu shift.
uint16_t GetMaxCapacity(int size_class, uint8_t shift) const;
// Gets the current capacity for the <size_class> in a <cpu> cache.
size_t GetCapacityOfSizeClass(int cpu, int size_class) const;
// Computes maximum capacities that we want to update the size classes to. It
// fetches the number of capacity misses observed for the size classes, and
// computes increases to the maximum capacities for the size classes with the
// highest misses. It computes maximum capacities for kNumBaseClasses number
// of size classes, starting with <start_size_class>. It records the resized
// size classes and capacities in <max_capacity> starting from index
// <valid_entries>.
// Returns total number of valid size classes recorded in <max_capacity>
// array.
int GetUpdatedMaxCapacities(int start_size_class,
PerSizeClassMaxCapacity* max_capacity,
int valid_entries);
// Resizes maximum capacities for the size classes. First, it computes
// candidates to resize using GetUpdatedMaxCapacities(...), and then updates
// maximum capacities for size classes for all per-cpu caches. Resizing is a
// global operation. It stops all per-cpu caches, drains them, updates maximum
// capacities and begin, current and end indices for the slabs and then
// restarts the per-cpu caches. Because it's a global operation that involves
// stopping all per-cpu caches, this mechanism should be used sparingly.
void ResizeSizeClassMaxCapacities();
// Empty out the cache on <cpu>; move all objects to the central
// cache. (If other threads run concurrently on that cpu, we can't
// guarantee it will be fully empty on return, but if the cpu is
// unused, this will eliminate stranded memory.) Returns the number
// of bytes we sent back. This function is thread safe.
uint64_t Reclaim(int cpu);
// Reports number of times the size classes were resized for <cpu>.
uint64_t GetNumResizes(int cpu) const;
// Reports total number of times size classes were resized.
uint64_t GetNumResizes() const;
// Reports number of times the <cpu> has been reclaimed.
uint64_t GetNumReclaims(int cpu) const;
// Reports total number of times any CPU has been reclaimed.
uint64_t GetNumReclaims() const;
// When dynamic slab size is enabled, checks if there is a need to resize
// the slab based on miss-counts and resizes if so.
void ResizeSlabIfNeeded();
// Reports total cache underflows and overflows for <cpu>.
CpuCacheMissStats GetTotalCacheMissStats(int cpu) const;
// Reports total cache underflows and overflows for all CPUs.
CpuCacheMissStats GetTotalCacheMissStats() const;
// Reports the cache underflows and overflows for <cpu> that were recorded
// during the previous interval for <miss_count>.
CpuCacheMissStats GetIntervalCacheMissStats(int cpu,
MissCount miss_count) const;
// Records current underflows and overflows in the <miss_count> underflow and
// overflow stats.
void UpdateIntervalCacheMissStats(int cpu, MissCount miss_count);
// Reports the cache underflows and overflows for <cpu> that were recorded
// during the previous interval for <miss_count>. Records current underflows
// and overflows in the <miss_count> underflow and overflow stats.
CpuCacheMissStats GetAndUpdateIntervalCacheMissStats(int cpu,
MissCount miss_count);
// Scans through populated per-CPU caches, and reports minimum, average and
// maximum capacity for size class <size_class>.
SizeClassCapacityStats GetSizeClassCapacityStats(size_t size_class) const;
// Reports the number of misses encountered by a <size_class> that were
// recorded during the previous interval between <total_type> and
// <interval_type> kinds of misses.
size_t GetIntervalSizeClassMisses(int cpu, size_t size_class,
PerClassMissType total_type,
PerClassMissType interval_type);
// Reports if we should use a wider 512KiB slab.
bool UseWiderSlabs() const;
// Reports if per-size-class maximum capacities configured in sizemap are
// used.
bool ConfigureSizeClassMaxCapacity() const;
// Reports allowed slab shift initial and maximum bounds.
SlabShiftBounds GetPerCpuSlabShiftBounds() const;
size_t GetDynamicSlabFailedBytes() const;
// Report statistics
void Print(Printer* out) const;
void PrintInPbtxt(PbtxtRegion* region) const;
const Forwarder& forwarder() const { return forwarder_; }
Forwarder& forwarder() { return forwarder_; }
private:
friend struct DrainHandler<CpuCache>;
friend class ::tcmalloc::tcmalloc_internal::CpuCachePeer;
using Freelist = subtle::percpu::TcmallocSlab<kNumClasses>;
struct PerClassMissCounts {
std::atomic<size_t>
misses[static_cast<size_t>(PerClassMissType::kNumTypes)];
std::atomic<size_t>& operator[](PerClassMissType type) {
return misses[static_cast<size_t>(type)];
}
};
// Per-size-class freelist resizing info.
class PerClassResizeInfo {
public:
void Init();
// Updates info on overflow/underflow.
// <overflow> says if it's overflow or underflow.
// <grow> is the caller's approximation of whether we want to grow capacity.
// <successive> will contain number of successive overflows/underflows.
// Returns if capacity needs to be grown aggressively (i.e. by batch size).
bool Update(bool overflow, bool grow, uint32_t* successive);
uint32_t Tick();
// Records a miss for a provided <type>. A miss occurs when a size class
// attempts to grow its capacity on underflow/overflow, but we are already
// at the maximum configured per-cpu cache capacity limit.
void RecordMiss(PerClassMissType type);
// Reports total number of misses recorded for this size class.
size_t GetTotalMisses(PerClassMissType type);
size_t GetAndUpdateIntervalMisses(PerClassMissType total_type,
PerClassMissType interval_type);
// Reports the number of misses encountered by this size class that
// were recorded during the previous interval between misses <total_type>
// and <interval_type>.
size_t GetIntervalMisses(PerClassMissType total_type,
PerClassMissType interval_type);
// Copies total misses of type <total_type> encountered by the size class to
// the type <interval_type>.
void UpdateIntervalMisses(PerClassMissType total_type,
PerClassMissType interval_type);
private:
std::atomic<int32_t> state_;
// state_ layout:
struct State {
// last overflow/underflow?
uint32_t overflow : 1;
// number of times Steal checked this class since the last grow
uint32_t quiescent_ticks : 15;
// number of successive overflows/underflows
uint32_t successive : 16;
};
PerClassMissCounts misses_;
static_assert(sizeof(State) == sizeof(std::atomic<int32_t>),
"size mismatch");
};
// Helper type so we don't need to sprinkle `static_cast`s everywhere.
struct MissCounts {
std::atomic<size_t> misses[static_cast<size_t>(MissCount::kNumCounts)];
std::atomic<size_t>& operator[](MissCount miss_count) {
return misses[static_cast<size_t>(miss_count)];
}
};
struct ABSL_CACHELINE_ALIGNED ResizeInfo {
// cache space on this CPU we're not using. Modify atomically;
// we don't want to lose space.
std::atomic<size_t> available;
// Size class to steal from for the clock-wise algorithm.
size_t next_steal = 1;
// Track whether we have ever populated this CPU.
std::atomic<bool> populated;
// For cross-cpu operations. We can't allocate while holding one of these so
// please use AllocationGuardSpinLockHolder to hold it.
absl::base_internal::SpinLock lock ABSL_ACQUIRED_BEFORE(pageheap_lock){
absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY};
PerClassResizeInfo per_class[kNumClasses];
std::atomic<size_t> num_size_class_resizes;
// Tracks number of underflows on allocate.
MissCounts underflows;
// Tracks number of overflows on deallocate.
MissCounts overflows;
std::atomic<int64_t> last_miss_cycles[2][kNumClasses];
// total cache space available on this CPU. This tracks the total
// allocated and unallocated bytes on this CPU cache.
std::atomic<size_t> capacity;
// Used bytes in the cache as of the end of the last resize interval.
std::atomic<uint64_t> reclaim_used_bytes;
// Tracks number of times this CPU has been reclaimed.
std::atomic<size_t> num_reclaims;
// Tracks last time this CPU was reclaimed. If last underflow/overflow data
// appears before this point in time, we ignore the CPU.
std::atomic<int64_t> last_reclaim;
};
// Determines how we distribute memory in the per-cpu cache to the various
// class sizes.
size_t MaxCapacity(size_t size_class) const;
// Updates maximum capacity for the <size_class> to <cap>.
void UpdateMaxCapacity(int size_class, uint16_t cap);
GetShiftMaxCapacity GetMaxCapacityFunctor(uint8_t shift) const;
// Fetches objects from backing transfer cache.
[[nodiscard]] int FetchFromBackingCache(size_t size_class,
absl::Span<void*> batch);
// Releases free batch of objects to the backing transfer cache.
void ReleaseToBackingCache(size_t size_class, absl::Span<void*> batch);
[[nodiscard]] void* Refill(int cpu, size_t size_class);
std::pair<int, bool> CacheCpuSlab();
void Populate(int cpu);
// Returns true if we bypass cpu cache for a <size_class>. We may bypass
// per-cpu cache when we enable certain configurations of sharded transfer
// cache.
bool BypassCpuCache(size_t size_class) const;
// Returns true if we use sharded transfer cache as a backing cache for
// per-cpu caches. If a sharded transfer cache is used, we fetch/release
// from/to a sharded transfer cache. Else, we use a legacy transfer cache.
bool UseBackingShardedTransferCache(size_t size_class) const;
// Called on <size_class> freelist on <cpu> to record overflow/underflow
// Returns number of objects to return/request from transfer cache.
size_t UpdateCapacity(int cpu, size_t size_class, bool overflow);
// Tries to grow freelist <size_class> on the current <cpu> by up to
// <desired_increase> objects if there is available capacity.
void Grow(int cpu, size_t size_class, size_t desired_increase);
// Depending on the number of misses that cpu caches encountered in the
// previous resize interval, returns if slabs should be grown, shrunk or
// remain the same.
DynamicSlabResize ShouldResizeSlab();
// Determine if the <size_class> is a good candidate to be shrunk. We use
// clock-like algorithm to prioritize size classes for shrinking.
bool IsGoodCandidateForShrinking(int cpu, size_t size_class);
struct SizeClassMissStat {
size_t size_class;
size_t misses;
};
struct CpuMissStat {
int cpu;
size_t misses;
};
// Tries to steal <bytes> for <size_class> on <cpu> from other size classes on
// that CPU. Returns acquired bytes.
size_t StealCapacityForSizeClassWithinCpu(
int cpu, absl::Span<SizeClassMissStat> dest_size_classes, size_t bytes);
// Records a cache underflow or overflow on <cpu>, increments underflow or
// overflow by 1.
// <is_alloc> determines whether the associated count corresponds to an
// underflow or overflow.
void RecordCacheMissStat(int cpu, bool is_alloc);
// Tries to steal <bytes> for the destination <cpu>. It iterates through
// the set of populated cpu caches and steals the bytes from them. A cpu is
// considered a good candidate to steal from if:
// (1) the cache is populated
// (2) the numbers of underflows and overflows are both less than 0.8x those
// of the destination per-cpu cache
// (3) source cpu is not the same as the destination cpu
// (4) capacity of the source cpu/size_class is non-zero
//
// For a given source cpu, we iterate through the size classes to steal from
// them. Currently, we use a clock-like algorithm to identify the size_class
// to steal from.
void StealFromOtherCache(int cpu, int max_populated_cpu,
absl::Span<CpuMissStat> skip_cpus, size_t bytes);
// Try to steal one object from cpu/size_class. Return bytes stolen.
size_t ShrinkOtherCache(int cpu, size_t size_class);
// Resizes capacities of up to kMaxSizeClassesToResize size classes for a
// single <cpu>.
void ResizeCpuSizeClasses(int cpu);
// <shift_offset> is the offset of the shift in slabs_by_shift_. Note that we
// can't calculate this from `shift` directly due to numa shift.
// Returns the allocated slabs and the number of reused bytes.
[[nodiscard]] std::pair<void*, size_t> AllocOrReuseSlabs(
absl::FunctionRef<void*(size_t, std::align_val_t)> alloc,
subtle::percpu::Shift shift, int num_cpus, uint8_t shift_offset,
uint8_t resize_offset);
// madvise-away slab memory, pointed to by <slab_addr> of size <slab_size>.
void MadviseAwaySlabs(void* slab_addr, size_t slab_size);
Freelist freelist_;
// Tracking data for each CPU's cache resizing efforts.
ResizeInfo* resize_ = nullptr;
// Tracks initial and maximum slab shift bounds.
SlabShiftBounds shift_bounds_{};
// The maximum capacity of each size class within the slab.
std::atomic<uint16_t> max_capacity_[kNumClasses] = {0};
// Provides a hint to StealFromOtherCache() so that we can steal from the
// caches in a round-robin fashion.
int next_cpu_cache_steal_ = 0;
// Provides a hint to ResizeSizeClasses() that records the last CPU for which
// we resized size classes. We use this to resize size classes for CPUs in a
// round-robin fashion.
std::atomic<int> last_cpu_size_class_resize_ = 0;
// Records the slab copy currently in use. We maintain kResizeSlabCopies
// sets of kNumPossiblePerCpuShifts slabs. While resizing maximum size class
// capacity, we choose a new slab from one of the copies. resize_slab_offset_
// is an index into the copy currently in use.
std::atomic<int> resize_slab_offset_ = 0;
// Per-core cache limit in bytes.
std::atomic<uint64_t> max_per_cpu_cache_size_{kMaxCpuCacheSize};
ABSL_ATTRIBUTE_NO_UNIQUE_ADDRESS Forwarder forwarder_;
DynamicSlabInfo dynamic_slab_info_{};
// Pointers to allocations for slabs of each shift value for use in
// ResizeSlabs. This memory is allocated on the arena, and it is nonresident
// while not in use.
void* slabs_by_shift_[kTotalPossibleSlabs] = {nullptr};
};
template <class Forwarder>
void* CpuCache<Forwarder>::Allocate(size_t size_class) {
void* ret = AllocateFast(size_class);
if (ABSL_PREDICT_TRUE(ret != nullptr)) {
return ret;
}
TCMALLOC_MUSTTAIL return AllocateSlow(size_class);
}
template <class Forwarder>
inline ABSL_ATTRIBUTE_ALWAYS_INLINE void* CpuCache<Forwarder>::AllocateFast(
size_t size_class) {
TC_ASSERT_GT(size_class, 0);
return freelist_.Pop(size_class);
}
template <class Forwarder>
void CpuCache<Forwarder>::Deallocate(void* ptr, size_t size_class) {
if (ABSL_PREDICT_FALSE(!DeallocateFast(ptr, size_class))) {
TCMALLOC_MUSTTAIL return DeallocateSlow(ptr, size_class);
}
}
template <class Forwarder>
inline ABSL_ATTRIBUTE_ALWAYS_INLINE bool CpuCache<Forwarder>::DeallocateFast(
void* ptr, size_t size_class) {
TC_ASSERT_GT(size_class, 0);
return freelist_.Push(size_class, ptr);
}
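// A minimal usage sketch of the fast/slow split above, assuming a configured
// CpuCache instance `cache` and a valid nonzero `size_class`:
//
//   void* obj = cache.Allocate(size_class);   // AllocateFast, then AllocateSlow
//   if (obj != nullptr) {
//     // ... use obj ...
//     cache.Deallocate(obj, size_class);      // DeallocateFast, then DeallocateSlow
//   }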
template <class Forwarder>
void CpuCache<Forwarder>::MaybeForceSlowPath() {
if (ABSL_PREDICT_FALSE(Static::HaveHooks())) {
freelist_.UncacheCpuSlab();
}
}
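// Builds the mask of active CPU ids: the process affinity mask when real CPU
// ids are in use, or the dense range [0, number of allowed CPUs) when rseq
// virtual CPUs are in use.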
static CpuSet FillActiveCpuMask() {
CpuSet allowed_cpus;
if (!allowed_cpus.GetAffinity(0)) {
allowed_cpus.Zero();
}
#ifdef PERCPU_USE_RSEQ
const bool real_cpus = !subtle::percpu::UsingVirtualCpus();
#else
const bool real_cpus = true;
#endif
if (real_cpus) {
return allowed_cpus;
}
const int virtual_cpu_count = allowed_cpus.Count();
allowed_cpus.Zero();
for (int cpu = 0; cpu < virtual_cpu_count; ++cpu) {
allowed_cpus.Set(cpu);
}
return allowed_cpus;
}
template <class Forwarder>
inline size_t CpuCache<Forwarder>::MaxCapacity(size_t size_class) const {
// The number of size classes that are commonly used and thus should be
// allocated more slots in the per-cpu cache.
static constexpr size_t kNumSmall = 10;
// When we use wider slabs, we also want to double the maximum capacities for
// size classes to use that slab.
const size_t kWiderSlabMultiplier = UseWiderSlabs() ? 2 : 1;
// The memory used for each per-CPU slab is the sum of:
// sizeof(std::atomic<int64_t>) * kNumClasses
// sizeof(void*) * (kSmallObjectDepth + 1) * kNumSmall
// sizeof(void*) * (kLargeObjectDepth + 1) * kNumLarge
//
// Class size 0 has MaxCapacity() == 0, which is the reason for using
// kNumClasses - 1 above instead of kNumClasses.
//
// Each Size class region in the slab is preceded by one padding pointer that
// points to itself, because prefetch instructions of invalid pointers are
// slow. That is accounted for by the +1 for object depths.
#if defined(TCMALLOC_INTERNAL_SMALL_BUT_SLOW)
// With SMALL_BUT_SLOW we have 4KiB of per-cpu slab and 46 class sizes we
// allocate:
// == 8 * 46 + 8 * ((16 + 1) * 10 + (6 + 1) * 35) = 3688 bytes of 4096
static const uint16_t kSmallObjectDepth = 16;
static const uint16_t kLargeObjectDepth = 6;
#else
// We allocate 256KiB per-cpu for pointers to cached per-cpu memory.
// Max(kNumClasses) is 89, so the maximum footprint per CPU for a 256KiB
// slab is:
// 89 * 8 + 8 * ((2048 + 1) * 10 + (152 + 1) * 78) = 254 KiB
// For 512KiB slab, with a multiplier of 2, maximum footprint is:
// 89 * 8 + 8 * ((4096 + 1) * 10 + (304 + 1) * 78) = 506 KiB
const uint16_t kSmallObjectDepth =
(ConfigureSizeClassMaxCapacity()
? tc_globals.sizemap().max_capacity(size_class)
: 2048 * kWiderSlabMultiplier);
const uint16_t kLargeObjectDepth =
(ConfigureSizeClassMaxCapacity()
? tc_globals.sizemap().max_capacity(size_class)
: 152 * kWiderSlabMultiplier);
#endif
if (size_class == 0 || size_class >= kNumClasses) {
return 0;
}
if (BypassCpuCache(size_class)) {
return 0;
}
if (forwarder_.class_to_size(size_class) == 0) {
return 0;
}
if (!IsExpandedSizeClass(size_class) &&
(size_class % kNumBaseClasses) <= kNumSmall) {
// Small object sizes are very heavily used and need very deep caches for
// good performance (well over 90% of malloc calls are for size_class
// <= 10.)
return kSmallObjectDepth;
}
if (ColdFeatureActive()) {
// We reduce the number of cached objects for some sizes to fit into the
// slab.
const uint16_t kLargeUninterestingObjectDepth =
(ConfigureSizeClassMaxCapacity()
? tc_globals.sizemap().max_capacity(size_class)
: 133 * kWiderSlabMultiplier);
const uint16_t kLargeInterestingObjectDepth = 28 * kWiderSlabMultiplier;
absl::Span<const size_t> cold = forwarder_.cold_size_classes();
if (absl::c_binary_search(cold, size_class)) {
return kLargeInterestingObjectDepth;
} else if (!IsExpandedSizeClass(size_class)) {
return kLargeUninterestingObjectDepth;
} else {
return 0;
}
}
if (IsExpandedSizeClass(size_class)) {
return 0;
}
return kLargeObjectDepth;
}
// Returns estimated bytes required and the bytes available.
inline std::pair<size_t, size_t> EstimateSlabBytes(
GetShiftMaxCapacity get_shift_capacity) {
size_t bytes_required = sizeof(std::atomic<int64_t>) * kNumClasses;
for (int size_class = 0; size_class < kNumClasses; ++size_class) {
// Each non-empty size class region in the slab is preceded by one padding
// pointer that points to itself. (We do this because prefetches of invalid
// pointers are slow.)
size_t num_pointers = get_shift_capacity(size_class);
if (num_pointers > 0) ++num_pointers;
bytes_required += sizeof(void*) * num_pointers;
}
const size_t bytes_available = 1 << get_shift_capacity.shift;
return {bytes_required, bytes_available};
}
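// For example, with the default 256 KiB slab (shift == 18), bytes_available is
// 1 << 18 = 262144, and bytes_required sums the kNumClasses header atomics
// plus (capacity + 1) pointers per non-empty size class, matching the ~254 KiB
// estimate in the MaxCapacity() comment above.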
template <class Forwarder>
inline uint16_t CpuCache<Forwarder>::GetMaxCapacity(int size_class,
uint8_t shift) const {
return GetMaxCapacityFunctor(shift)(size_class);
}
template <class Forwarder>
inline size_t CpuCache<Forwarder>::GetCapacityOfSizeClass(
int cpu, int size_class) const {
return freelist_.Capacity(cpu, size_class);
}
template <class Forwarder>
inline GetShiftMaxCapacity CpuCache<Forwarder>::GetMaxCapacityFunctor(
uint8_t shift) const {
return {max_capacity_, shift, shift_bounds_};
}
template <class Forwarder>
inline void CpuCache<Forwarder>::UpdateMaxCapacity(int size_class,
uint16_t cap) {
max_capacity_[size_class].store(cap, std::memory_order_relaxed);
}
template <class Forwarder>
inline bool CpuCache<Forwarder>::UseWiderSlabs() const {
// We use wider 512KiB slab only when NUMA partitioning is not enabled. NUMA
// increases shift by 1 by itself, so we can not increase it further.
return !forwarder_.numa_topology().numa_aware();
}
template <class Forwarder>
inline bool CpuCache<Forwarder>::ConfigureSizeClassMaxCapacity() const {
return forwarder_.ConfigureSizeClassMaxCapacity();
}
template <class Forwarder>
inline SlabShiftBounds CpuCache<Forwarder>::GetPerCpuSlabShiftBounds() const {
return shift_bounds_;
}
template <class Forwarder>
inline size_t CpuCache<Forwarder>::GetDynamicSlabFailedBytes() const {
return dynamic_slab_info_.madvise_failed_bytes.load(
std::memory_order_relaxed);
}
template <class Forwarder>
inline void CpuCache<Forwarder>::Activate() {
int num_cpus = NumCPUs();
shift_bounds_.initial_shift = kInitialBasePerCpuShift;
shift_bounds_.max_shift = kMaxBasePerCpuShift;
uint8_t per_cpu_shift = forwarder_.per_cpu_caches_dynamic_slab_enabled()
? kInitialBasePerCpuShift
: kMaxBasePerCpuShift;
const auto& topology = forwarder_.numa_topology();
const uint8_t numa_shift = NumaShift(topology);
const uint8_t wider_slab_shift = UseWiderSlabs() ? 1 : 0;
shift_bounds_.initial_shift += numa_shift + wider_slab_shift;
shift_bounds_.max_shift += numa_shift + wider_slab_shift;
per_cpu_shift += numa_shift + wider_slab_shift;
TC_CHECK_LE(shift_bounds_.initial_shift, shift_bounds_.max_shift);
TC_CHECK_GE(per_cpu_shift, shift_bounds_.initial_shift);
TC_CHECK_LE(per_cpu_shift, shift_bounds_.max_shift);
TC_CHECK_EQ(shift_bounds_.max_shift - shift_bounds_.initial_shift + 1,
kNumPossiblePerCpuShifts);
// Deal with size classes that correspond only to NUMA partitions that are in
// use. If NUMA awareness is disabled then we may have a smaller shift than
// would suffice for all of the unused size classes.
for (int size_class = 0;
size_class < topology.active_partitions() * kNumBaseClasses;
++size_class) {
max_capacity_[size_class].store(MaxCapacity(size_class),
std::memory_order_relaxed);
}
// Deal with expanded size classes.
for (int size_class = kExpandedClassesStart; size_class < kNumClasses;
++size_class) {
max_capacity_[size_class].store(MaxCapacity(size_class),
std::memory_order_relaxed);
}
// Verify that all the possible shifts will have valid max capacities.
for (uint8_t shift = shift_bounds_.initial_shift;
shift <= shift_bounds_.max_shift; ++shift) {
const auto [bytes_required, bytes_available] =
EstimateSlabBytes({max_capacity_, shift, shift_bounds_});
// We may make certain size classes no-ops by selecting "0" at runtime, so
// using a compile-time calculation overestimates worst-case memory usage.
if (ABSL_PREDICT_FALSE(bytes_required > bytes_available)) {
TC_BUG("per-CPU memory exceeded, have %v, need %v", bytes_available,
bytes_required);
}
}