//!
//! Parse PostgreSQL WAL records and store them in a neon Timeline.
//!
//! The pipeline for ingesting WAL looks like this:
//!
//! WAL receiver -> [`wal_decoder`] -> WalIngest -> Repository
//!
//! The WAL receiver receives a stream of WAL from the WAL safekeepers.
//! Records get decoded and interpreted in the [`wal_decoder`] module
//! and then stored to the Repository by WalIngest.
//!
//! The neon Repository can store page versions in two formats: as
//! page images, or as WAL records. [`wal_decoder::models::InterpretedWalRecord::from_bytes_filtered`]
//! extracts page images out of some WAL records, but most page versions are
//! stored as WAL records. If a WAL record modifies multiple pages, WalIngest
//! calls the Repository::put_rel_wal_record or put_rel_page_image function
//! separately for each modified page.
//!
//! To reconstruct a page using a WAL record, the Repository calls the
//! code in walredo.rs. walredo.rs passes most WAL records to the WAL
//! redo Postgres process, but it can handle some records directly with
//! bespoke Rust code.
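//!
//! A minimal sketch of how a caller drives this module (illustrative only:
//! the decoding loop and exact signatures are simplified, and
//! `next_interpreted_record` is a hypothetical stand-in for the
//! wal_decoder-driven receiver loop):
//!
//! ```ignore
//! let mut walingest = WalIngest::new(&timeline, startpoint, &ctx).await?;
//! while let Some(interpreted) = next_interpreted_record().await? {
//!     // One DatadirModification batches the writes for this record.
//!     let mut modification = timeline.begin_modification(interpreted.next_record_lsn);
//!     walingest.ingest_record(interpreted, &mut modification, &ctx).await?;
//!     // Nothing is durable until the modification is committed.
//!     modification.commit(&ctx).await?;
//! }
//! ```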
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::OnceLock;
use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::fsm_logical_to_physical;
use postgres_ffi::walrecord::*;
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
use wal_decoder::models::*;
use anyhow::{bail, Result};
use bytes::{Buf, Bytes};
use tracing::*;
use utils::failpoint_support;
use utils::rate_limit::RateLimit;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::{DatadirModification, Version};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::ZERO_PAGE;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::TransactionId;
use utils::bin_ser::SerializeError;
use utils::lsn::Lsn;
enum_pgversion! {CheckPoint, pgv::CheckPoint}
impl CheckPoint {
fn encode(&self) -> Result<Bytes, SerializeError> {
enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() })
}
fn update_next_xid(&mut self, xid: u32) -> bool {
enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) })
}
pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
enum_pgversion_dispatch!(self, CheckPoint, cp, {
cp.update_next_multixid(multi_xid, multi_offset)
})
}
}
/// Temporary limitation of WAL lag warnings after attach
///
/// After tenant attach, we want to limit WAL lag warnings because
/// we don't look at the WAL until the attach is complete, which
/// might take a while.
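///
/// For example (following the formula in [`WalLagCooldown::new`]): if the
/// attach took 10 minutes, warnings are suppressed until 32 minutes after the
/// attach started (3x the attach duration plus 120 seconds), and only for lags
/// of up to 21 minutes (2x the attach duration plus 60 seconds).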
pub struct WalLagCooldown {
/// Until when this limitation applies
active_until: std::time::Instant,
/// The maximum lag to suppress. Lags above this limit get reported anyway.
max_lag: Duration,
}
impl WalLagCooldown {
pub fn new(attach_start: Instant, attach_duration: Duration) -> Self {
Self {
active_until: attach_start + attach_duration * 3 + Duration::from_secs(120),
max_lag: attach_duration * 2 + Duration::from_secs(60),
}
}
}
pub struct WalIngest {
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
shard: ShardIdentity,
checkpoint: CheckPoint,
checkpoint_modified: bool,
warn_ingest_lag: WarnIngestLag,
}
struct WarnIngestLag {
lag_msg_ratelimit: RateLimit,
future_lsn_msg_ratelimit: RateLimit,
timestamp_invalid_msg_ratelimit: RateLimit,
}
impl WalIngest {
pub async fn new(
timeline: &Timeline,
startpoint: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<WalIngest> {
// Fetch the latest checkpoint into memory, so that we can compare with it
// quickly in `ingest_record` and update it when it changes.
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
let pgversion = timeline.pg_version;
let checkpoint = dispatch_pgversion!(pgversion, {
let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
<pgv::CheckPoint as Into<CheckPoint>>::into(checkpoint)
});
Ok(WalIngest {
shard: *timeline.get_shard_identity(),
checkpoint,
checkpoint_modified: false,
attach_wal_lag_cooldown: timeline.attach_wal_lag_cooldown.clone(),
warn_ingest_lag: WarnIngestLag {
lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
timestamp_invalid_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
},
})
}
/// Ingest an interpreted PostgreSQL WAL record by doing writes to the underlying key value
/// storage of a given timeline.
///
/// This function updates the `lsn` field of the `DatadirModification`.
///
/// This function returns `true` if the record was ingested, and `false` if it was filtered out
pub async fn ingest_record(
&mut self,
interpreted: InterpretedWalRecord,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<bool> {
WAL_INGEST.records_received.inc();
let prev_len = modification.len();
modification.set_lsn(interpreted.next_record_lsn)?;
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) {
// Records of this type should always be preceded by a commit(), as they
// rely on reading data pages back from the Timeline.
assert!(!modification.has_dirty_data());
}
assert!(!self.checkpoint_modified);
if interpreted.xid != pg_constants::INVALID_TRANSACTION_ID
&& self.checkpoint.update_next_xid(interpreted.xid)
{
self.checkpoint_modified = true;
}
failpoint_support::sleep_millis_async!("wal-ingest-record-sleep");
match interpreted.metadata_record {
Some(MetadataRecord::Heapam(rec)) => match rec {
HeapamRecord::ClearVmBits(clear_vm_bits) => {
self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx)
.await?;
}
},
Some(MetadataRecord::Neonrmgr(rec)) => match rec {
NeonrmgrRecord::ClearVmBits(clear_vm_bits) => {
self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx)
.await?;
}
},
Some(MetadataRecord::Smgr(rec)) => match rec {
SmgrRecord::Create(create) => {
self.ingest_xlog_smgr_create(create, modification, ctx)
.await?;
}
SmgrRecord::Truncate(truncate) => {
self.ingest_xlog_smgr_truncate(truncate, modification, ctx)
.await?;
}
},
Some(MetadataRecord::Dbase(rec)) => match rec {
DbaseRecord::Create(create) => {
self.ingest_xlog_dbase_create(create, modification, ctx)
.await?;
}
DbaseRecord::Drop(drop) => {
self.ingest_xlog_dbase_drop(drop, modification, ctx).await?;
}
},
Some(MetadataRecord::Clog(rec)) => match rec {
ClogRecord::ZeroPage(zero_page) => {
self.ingest_clog_zero_page(zero_page, modification, ctx)
.await?;
}
ClogRecord::Truncate(truncate) => {
self.ingest_clog_truncate(truncate, modification, ctx)
.await?;
}
},
Some(MetadataRecord::Xact(rec)) => {
self.ingest_xact_record(rec, modification, ctx).await?;
}
Some(MetadataRecord::MultiXact(rec)) => match rec {
MultiXactRecord::ZeroPage(zero_page) => {
self.ingest_multixact_zero_page(zero_page, modification, ctx)
.await?;
}
MultiXactRecord::Create(create) => {
self.ingest_multixact_create(modification, &create)?;
}
MultiXactRecord::Truncate(truncate) => {
self.ingest_multixact_truncate(modification, &truncate, ctx)
.await?;
}
},
Some(MetadataRecord::Relmap(rec)) => match rec {
RelmapRecord::Update(update) => {
self.ingest_relmap_update(update, modification, ctx).await?;
}
},
Some(MetadataRecord::Xlog(rec)) => match rec {
XlogRecord::Raw(raw) => {
self.ingest_raw_xlog_record(raw, modification, ctx).await?;
}
},
Some(MetadataRecord::LogicalMessage(rec)) => match rec {
LogicalMessageRecord::Put(put) => {
self.ingest_logical_message_put(put, modification, ctx)
.await?;
}
#[cfg(feature = "testing")]
LogicalMessageRecord::Failpoint => {
// This is a convenient way to make the WAL ingestion pause at a
// particular point in the WAL. For more fine-grained control,
// we could peek into the message and only pause if it contains
// a particular string, for example, but this is enough for now.
failpoint_support::sleep_millis_async!(
"pageserver-wal-ingest-logical-message-sleep"
);
}
},
Some(MetadataRecord::Standby(rec)) => {
self.ingest_standby_record(rec).unwrap();
}
Some(MetadataRecord::Replorigin(rec)) => {
self.ingest_replorigin_record(rec, modification).await?;
}
None => {
// There are two cases through which we end up here:
// 1. The resource manager for the original PG WAL record
// is [`pg_constants::RM_TBLSPC_ID`]. This is not a supported
// record type within Neon.
// 2. The resource manager id was unknown to
// [`wal_decoder::decoder::MetadataRecord::from_decoded`].
// TODO(vlad): Tighten this up more once we build confidence
// that case (2) does not happen in the field.
}
}
modification
.ingest_batch(interpreted.batch, &self.shard, ctx)
.await?;
// If checkpoint data was updated, store the new version in the repository
if self.checkpoint_modified {
let new_checkpoint_bytes = self.checkpoint.encode()?;
modification.put_checkpoint(new_checkpoint_bytes)?;
self.checkpoint_modified = false;
}
// Note that at this point this record is only cached in the modification
// until commit() is called to flush the data into the repository and update
// the latest LSN.
Ok(modification.len() > prev_len)
}
/// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
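///
/// The 32-bit `xid` from the WAL record is widened to 64 bits using the epoch
/// of `nextXid` from the last seen checkpoint. Illustrative example: if
/// `nextXid` is epoch 2, xid 5 (full value 0x0000_0002_0000_0005) and `xid` is
/// 0xFFFF_FFF0, then `xid` is greater than the 32-bit `nextXid`, so it must
/// belong to the previous epoch and the result is 0x0000_0001_FFFF_FFF0.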
fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
let next_full_xid =
enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
let next_xid = (next_full_xid) as u32;
let mut epoch = (next_full_xid >> 32) as u32;
if xid > next_xid {
// Wraparound occurred, so the xid must be from the previous epoch.
if epoch == 0 {
bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}");
}
epoch -= 1;
}
Ok((epoch as u64) << 32 | xid as u64)
}
async fn ingest_clear_vm_bits(
&mut self,
clear_vm_bits: ClearVmBits,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let ClearVmBits {
new_heap_blkno,
old_heap_blkno,
flags,
vm_rel,
} = clear_vm_bits;
// Clear the VM bits if required.
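// Each visibility map page tracks many heap blocks (two bits per heap block),
// so HEAPBLK_TO_MAPBLOCK maps a heap block number to the VM block that holds
// its bits.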
let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
// Sometimes, Postgres seems to create heap WAL records with the
// ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is
// not set. In fact, it's possible that the VM page does not exist at all.
// In that case, we don't want to store a record to clear the VM bit;
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check whether the VM page(s) exist, and skip the WAL
// record if they don't.
//
// TODO: analyze the metrics and tighten this up accordingly. This logic
// implicitly assumes that VM pages see explicit WAL writes before
// implicit ClearVmBits, and will otherwise silently drop updates.
let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["relation"])
.inc();
return Ok(());
};
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["new_page"])
.inc();
new_vm_blk = None;
}
}
if let Some(blknum) = old_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["old_page"])
.inc();
old_vm_blk = None;
}
}
if new_vm_blk.is_some() || old_vm_blk.is_some() {
if new_vm_blk == old_vm_blk {
// An UPDATE record that needs to clear the bits for both the old and the
// new page, both of which reside on the same VM page.
self.put_rel_wal_record(
modification,
vm_rel,
new_vm_blk.unwrap(),
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags,
},
ctx,
)
.await?;
} else {
// Clear VM bits for one heap page, or for two pages that reside on
// different VM pages.
if let Some(new_vm_blk) = new_vm_blk {
self.put_rel_wal_record(
modification,
vm_rel,
new_vm_blk,
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno: None,
flags,
},
ctx,
)
.await?;
}
if let Some(old_vm_blk) = old_vm_blk {
self.put_rel_wal_record(
modification,
vm_rel,
old_vm_blk,
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags,
},
ctx,
)
.await?;
}
}
}
Ok(())
}
/// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
async fn ingest_xlog_dbase_create(
&mut self,
create: DbaseCreate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let DbaseCreate {
db_id,
tablespace_id,
src_db_id,
src_tablespace_id,
} = create;
let rels = modification
.tline
.list_rels(
src_tablespace_id,
src_db_id,
Version::Modified(modification),
ctx,
)
.await?;
debug!("ingest_xlog_dbase_create: {} rels", rels.len());
// Copy relfilemap
let filemap = modification
.tline
.get_relmap_file(
src_tablespace_id,
src_db_id,
Version::Modified(modification),
ctx,
)
.await?;
modification
.put_relmap_file(tablespace_id, db_id, filemap, ctx)
.await?;
let mut num_rels_copied = 0;
let mut num_blocks_copied = 0;
for src_rel in rels {
assert_eq!(src_rel.spcnode, src_tablespace_id);
assert_eq!(src_rel.dbnode, src_db_id);
let nblocks = modification
.tline
.get_rel_size(src_rel, Version::Modified(modification), ctx)
.await?;
let dst_rel = RelTag {
spcnode: tablespace_id,
dbnode: db_id,
relnode: src_rel.relnode,
forknum: src_rel.forknum,
};
modification.put_rel_creation(dst_rel, nblocks, ctx).await?;
// Copy content
debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks);
for blknum in 0..nblocks {
// Sharding:
// - src and dst are always on the same shard, because they differ only by dbNode, and
// dbNode is not included in the hash inputs for sharding.
// - This WAL command is replayed on all shards, but each shard only copies the blocks
// that belong to it.
let src_key = rel_block_to_key(src_rel, blknum);
if !self.shard.is_key_local(&src_key) {
debug!(
"Skipping non-local key {} during XLOG_DBASE_CREATE",
src_key
);
continue;
}
debug!(
"copying block {} from {} ({}) to {}",
blknum, src_rel, src_key, dst_rel
);
let content = modification
.tline
.get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx)
.await?;
modification.put_rel_page_image(dst_rel, blknum, content)?;
num_blocks_copied += 1;
}
num_rels_copied += 1;
}
info!(
"Created database {}/{}, copied {} blocks in {} rels",
tablespace_id, db_id, num_blocks_copied, num_rels_copied
);
Ok(())
}
async fn ingest_xlog_dbase_drop(
&mut self,
dbase_drop: DbaseDrop,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let DbaseDrop {
db_id,
tablespace_ids,
} = dbase_drop;
for tablespace_id in tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, db_id);
modification.drop_dbdir(tablespace_id, db_id, ctx).await?;
}
Ok(())
}
async fn ingest_xlog_smgr_create(
&mut self,
create: SmgrCreate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let SmgrCreate { rel } = create;
self.put_rel_creation(modification, rel, ctx).await?;
Ok(())
}
/// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record.
///
/// This is the same logic as in PostgreSQL's smgr_redo() function.
async fn ingest_xlog_smgr_truncate(
&mut self,
truncate: XlSmgrTruncate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let XlSmgrTruncate {
blkno,
rnode,
flags,
} = truncate;
let spcnode = rnode.spcnode;
let dbnode = rnode.dbnode;
let relnode = rnode.relnode;
if flags & pg_constants::SMGR_TRUNCATE_HEAP != 0 {
let rel = RelTag {
spcnode,
dbnode,
relnode,
forknum: MAIN_FORKNUM,
};
self.put_rel_truncation(modification, rel, blkno, ctx)
.await?;
}
if flags & pg_constants::SMGR_TRUNCATE_FSM != 0 {
let rel = RelTag {
spcnode,
dbnode,
relnode,
forknum: FSM_FORKNUM,
};
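// The FSM fork stores a multi-level tree of FSM pages on disk. The division
// below yields the logical (leaf-level) page number, and fsm_logical_to_physical
// converts it to the physical block number, accounting for the interleaved
// upper-level pages.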
let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE;
let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
// The tail of the last remaining FSM page has to be zeroed.
// We are not precise here; instead of digging into the FSM bitmap format, we just clear the whole page.
modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
fsm_physical_page_no += 1;
}
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
.await?;
}
}
if flags & pg_constants::SMGR_TRUNCATE_VM != 0 {
let rel = RelTag {
spcnode,
dbnode,
relnode,
forknum: VISIBILITYMAP_FORKNUM,
};
// last remaining block, byte, and bit
let mut vm_page_no = blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32);
let trunc_byte = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE
/ pg_constants::VM_HEAPBLOCKS_PER_BYTE;
let trunc_offs = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_BYTE
* pg_constants::VM_BITS_PER_HEAPBLOCK;
// Unless the new size is exactly at a visibility map page boundary, the
// tail bits in the last remaining map page, representing truncated heap
// blocks, need to be cleared. This is not only tidy, but also necessary
// because we don't get a chance to clear the bits if the heap is extended
// again.
if (trunc_byte != 0 || trunc_offs != 0)
&& self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
{
modification.put_rel_wal_record(
rel,
vm_page_no,
NeonWalRecord::TruncateVisibilityMap {
trunc_byte,
trunc_offs,
},
)?;
vm_page_no += 1;
}
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
.await?;
}
}
Ok(())
}
fn warn_on_ingest_lag(
&mut self,
conf: &crate::config::PageServerConf,
wal_timestamp: TimestampTz,
) {
debug_assert_current_span_has_tenant_and_timeline_id();
let now = SystemTime::now();
let rate_limits = &mut self.warn_ingest_lag;
let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, {
pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp)
});
match ts {
Ok(ts) => {
match now.duration_since(ts) {
Ok(lag) => {
if lag > conf.wait_lsn_timeout {
rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| {
if let Some(cooldown) = self.attach_wal_lag_cooldown.get() {
if std::time::Instant::now() < cooldown.active_until && lag <= cooldown.max_lag {
return;
}
} else {
// Still loading? We shouldn't be here
}
let lag = humantime::format_duration(lag);
warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
})
}
}
Err(e) => {
let delta_t = e.duration();
// determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds)
// => https://www.robustperception.io/time-metric-from-the-node-exporter/
const IGNORED_DRIFT: Duration = Duration::from_millis(100);
if delta_t > IGNORED_DRIFT {
let delta_t = humantime::format_duration(delta_t);
rate_limits.future_lsn_msg_ratelimit.call2(|rate_limit_stats| {
warn!(%rate_limit_stats, %delta_t, "ingesting record with timestamp from future");
})
}
}
};
}
Err(error) => {
rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| {
warn!(%rate_limit_stats, %error, "ingesting record with invalid timestamp, cannot calculate lag and will fail find-lsn-for-timestamp type queries");
})
}
}
}
/// Subroutine of ingest_record(), to handle XLOG_XACT_* records.
///
async fn ingest_xact_record(
&mut self,
record: XactRecord,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let (xact_common, is_commit, is_prepared) = match record {
XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
let xid: u64 = if modification.tline.pg_version >= 17 {
self.adjust_to_full_transaction_id(xl_xid)?
} else {
xl_xid as u64
};
return modification.put_twophase_file(xid, data, ctx).await;
}
XactRecord::Commit(common) => (common, true, false),
XactRecord::Abort(common) => (common, false, false),
XactRecord::CommitPrepared(common) => (common, true, true),
XactRecord::AbortPrepared(common) => (common, false, true),
};
let XactCommon {
parsed,
origin_id,
xl_xid,
lsn,
} = xact_common;
// Record update of CLOG pages
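// Illustrative example (assuming the standard 8 kB block size, where
// CLOG_XACTS_PER_PAGE is 32768 and SLRU_PAGES_PER_SEGMENT is 32): xid 100_000
// lands on CLOG page 3, i.e. page 3 (rpageno) of segment 0 (segno).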
let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let mut rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
let mut page_xids: Vec<TransactionId> = vec![parsed.xid];
self.warn_on_ingest_lag(modification.tline.conf, parsed.xact_time);
for subxact in &parsed.subxacts {
let subxact_pageno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
if subxact_pageno != pageno {
// This subxact goes to a different page. Write the record
// for all the XIDs on the previous page, and continue
// accumulating XIDs on this new page.
modification.put_slru_wal_record(
SlruKind::Clog,
segno,
rpageno,
if is_commit {
NeonWalRecord::ClogSetCommitted {
xids: page_xids,
timestamp: parsed.xact_time,
}
} else {
NeonWalRecord::ClogSetAborted { xids: page_xids }
},
)?;
page_xids = Vec::new();
}
pageno = subxact_pageno;
segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
page_xids.push(*subxact);
}
modification.put_slru_wal_record(
SlruKind::Clog,
segno,
rpageno,
if is_commit {
NeonWalRecord::ClogSetCommitted {
xids: page_xids,
timestamp: parsed.xact_time,
}
} else {
NeonWalRecord::ClogSetAborted { xids: page_xids }
},
)?;
// Group relations to drop by dbNode. This map will contain all relations that _might_
// exist; we will later reduce it to the ones that really exist. This map can be huge if
// the transaction touches a huge number of relations (there is no bound on this in
// postgres).
let mut drop_relations: HashMap<(u32, u32), Vec<RelTag>> = HashMap::new();
for xnode in &parsed.xnodes {
for forknum in MAIN_FORKNUM..=INIT_FORKNUM {
let rel = RelTag {
forknum,
spcnode: xnode.spcnode,
dbnode: xnode.dbnode,
relnode: xnode.relnode,
};
drop_relations
.entry((xnode.spcnode, xnode.dbnode))
.or_default()
.push(rel);
}
}
// Execute relation drops in a batch: the number may be huge, so deleting individually is prohibitively expensive
modification.put_rel_drops(drop_relations, ctx).await?;
if origin_id != 0 {
modification
.set_replorigin(origin_id, parsed.origin_lsn)
.await?;
}
if is_prepared {
// Remove the twophase file. See RemoveTwoPhaseFile() in the Postgres code.
trace!(
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
xl_xid,
parsed.xid,
lsn,
);
let xid: u64 = if modification.tline.pg_version >= 17 {
self.adjust_to_full_transaction_id(parsed.xid)?
} else {
parsed.xid as u64
};
modification.drop_twophase_file(xid, ctx).await?;
}
Ok(())
}
async fn ingest_clog_truncate(
&mut self,
truncate: ClogTruncate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let ClogTruncate {
pageno,
oldest_xid,
oldest_xid_db,
} = truncate;
info!(
"RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}",
pageno, oldest_xid, oldest_xid_db
);
// In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is
// truncated, but a checkpoint record with the updated values isn't written until
// later. In Neon, a server can start at any LSN, not just on a checkpoint record,
// so we keep the oldestXid and oldestXidDB up-to-date.
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
cp.oldestXid = oldest_xid;
cp.oldestXidDB = oldest_xid_db;
});
self.checkpoint_modified = true;
// TODO: Handle AdvanceOldestClogXid(), or write a comment explaining why we don't need it
let latest_page_number =
enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32
/ pg_constants::CLOG_XACTS_PER_PAGE;
// Now delete all segments containing pages between xlrec.pageno
// and latest_page_number.
// First, make an important safety check:
// the current endpoint page must not be eligible for removal.
// See SimpleLruTruncate() in slru.c
if dispatch_pgversion!(modification.tline.pg_version, {
pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, pageno)
}) {
info!("could not truncate directory pg_xact apparent wraparound");
return Ok(());
}
// Iterate via SLRU CLOG segments and drop segments that we're ready to truncate
//
// We cannot pass 'lsn' to the Timeline.list_nonrels(), or it
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
.await?
{
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
});
if may_delete {
modification
.drop_slru_segment(SlruKind::Clog, segno, ctx)
.await?;
trace!("Drop CLOG segment {:>04X}", segno);
}
}
Ok(())
}
async fn ingest_clog_zero_page(
&mut self,
zero_page: ClogZeroPage,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let ClogZeroPage { segno, rpageno } = zero_page;
self.put_slru_page_image(
modification,
SlruKind::Clog,
segno,
rpageno,
ZERO_PAGE.clone(),
ctx,
)
.await
}
fn ingest_multixact_create(
&mut self,
modification: &mut DatadirModification,
xlrec: &XlMultiXactCreate,
) -> Result<()> {
// Create WAL record for updating the multixact-offsets page
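// Illustrative example (assuming the standard 8 kB block size, where
// MULTIXACT_OFFSETS_PER_PAGE is 2048 and SLRU_PAGES_PER_SEGMENT is 32):
// multixact id 10_000 lands on offsets page 4, i.e. page 4 of segment 0.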
let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
modification.put_slru_wal_record(
SlruKind::MultiXactOffsets,
segno,
rpageno,
NeonWalRecord::MultixactOffsetCreate {
mid: xlrec.mid,
moff: xlrec.moff,
},
)?;
// Create WAL records for the update of each affected multixact-members page
let mut members = xlrec.members.iter();
let mut offset = xlrec.moff;
loop {
let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
// How many members fit on this page?
let page_remain = pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32
- offset % pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let mut this_page_members: Vec<MultiXactMember> = Vec::new();
for _ in 0..page_remain {
if let Some(m) = members.next() {
this_page_members.push(m.clone());
} else {
break;
}
}
if this_page_members.is_empty() {
// all done
break;
}
let n_this_page = this_page_members.len();
modification.put_slru_wal_record(
SlruKind::MultiXactMembers,
pageno / pg_constants::SLRU_PAGES_PER_SEGMENT,
pageno % pg_constants::SLRU_PAGES_PER_SEGMENT,
NeonWalRecord::MultixactMembersCreate {
moff: offset,
members: this_page_members,
},
)?;
// Note: The multixact members can wrap around, even within one WAL record.
offset = offset.wrapping_add(n_this_page as u32);
}
let next_offset = offset;
assert!(xlrec.moff.wrapping_add(xlrec.nmembers) == next_offset);
// Update next-multi-xid and next-offset
//
// NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to
// go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that
// read it, like GetNewMultiXactId(). This is different from how nextXid is
// incremented! nextXid skips over values < FirstNormalTransactionId when the value
// is stored, so it's never 0 in a checkpoint.
//
// I don't know why it's done that way; it seems less error-prone to skip over 0
// when the value is stored rather than when it's read. But let's do it the same
// way here.
let next_multi_xid = xlrec.mid.wrapping_add(1);
if self
.checkpoint
.update_next_multixid(next_multi_xid, next_offset)
{
self.checkpoint_modified = true;
}
// Also update the next-xid with the highest member. According to the comments in
// multixact_redo(), this shouldn't be necessary, but let's do the same here.