-
-
Notifications
You must be signed in to change notification settings - Fork 502
/
Copy pathsource.rs
661 lines (608 loc) · 29.8 KB
/
source.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
#![expect(clippy::unnecessary_safety_comment)]
use std::{marker::PhantomData, slice, str};
use crate::{UniquePromise, MAX_LEN};
use super::search::SEARCH_BATCH_SIZE;
/// `Source` holds the source text for the lexer, and provides APIs to read it.
///
/// It provides a cursor which allows consuming source text either as `char`s, or as bytes.
/// It replaces `std::str::Chars` iterator which performed the same function previously,
/// but was less flexible as only allowed consuming source char by char.
///
/// Consuming source text byte-by-byte is often more performant than char-by-char.
///
/// `Source` provides:
///
/// * Safe API for consuming source char-by-char (`Source::next_char`, `Source::peek_char`).
/// * Safe API for peeking next source byte (`Source::peek_byte`).
/// * Unsafe API for consuming source byte-by-byte (`Source::next_byte`).
/// * Mostly-safe API for rewinding to a previous position in source
/// (`Source::position`, `Source::set_position`).
///
/// # Composition of `Source`
///
/// * `start` is pointer to start of source text.
/// * `end` is pointer to end of source text.
/// * `ptr` is cursor for current position in source text.
///
/// # Invariants of `Source`
///
/// 1. `start` <= `end`
/// 2. The region of memory bounded between `start` and `end` must be initialized,
/// a single allocation, and contain the bytes of a valid UTF-8 string.
/// 3. `ptr` must always be >= `start` and <= `end`.
/// i.e. cursor always within bounds of source text `&str`, or 1 byte after last byte
/// of source text (positioned on EOF).
/// 4. `ptr` must always point to a UTF-8 character boundary, or EOF.
/// i.e. pointing to *1st* byte of a UTF-8 character.
///
/// These invariants are the same as `std::str::Chars`, except `Source` allows temporarily
/// breaking invariant (4) to step through source text byte-by-byte.
///
/// Invariants (1), (2) and (3) must be upheld at all times.
/// Invariant (4) can be temporarily broken, as long as caller ensures it's satisfied again.
///
/// Invariants (1) and (2) are enforced by initializing `start` and `end` from a valid `&str`,
/// and they are never modified after initialization.
///
/// Safe methods of `Source` enforce invariant (3) i.e. they do not allow reading past EOF.
/// Unsafe methods e.g. `Source::next_byte_unchecked` and `Source::peek_byte_unchecked`
/// require caller to uphold this invariant.
///
/// Invariant (4) is the most difficult to satisfy.
/// `Source::next_char` relies on source text being valid UTF-8 to provide a safe API which
/// upholds this invariant.
/// `Source::next_byte` requires very careful use as it may violate invariant (4).
/// That is fine temporarily, but caller *must* ensure the safety conditions of `Source::next_byte`
/// are satisfied, to restore this invariant before passing control back to other code.
/// It will often be preferable to instead use `Source::peek_byte`, followed by `Source::next_char`,
/// which are safe methods, and compiler will often reduce to equally efficient code.
pub(super) struct Source<'a> {
/// Pointer to start of source string. Never altered after initialization.
start: *const u8,
/// Pointer to end of source string. Never altered after initialization.
end: *const u8,
/// Pointer to current position in source string
ptr: *const u8,
/// Memory address past which not enough bytes remaining in source to process a batch of
/// `SEARCH_BATCH_SIZE` bytes in one go.
/// Must be `usize`, not a pointer, as if source is very short, a pointer could be out of bounds.
end_for_batch_search_addr: usize,
/// Marker for immutable borrow of source string
_marker: PhantomData<&'a str>,
}
impl<'a> Source<'a> {
/// Create `Source` from `&str`.
///
/// Requiring a `UniquePromise` to be provided guarantees only 1 `Source` can exist
/// on a single thread at one time.
#[expect(unused_variables, clippy::needless_pass_by_value)]
pub(super) fn new(mut source_text: &'a str, unique: UniquePromise) -> Self {
// If source text exceeds size limit, substitute a short source text which will fail to parse.
// `Parser::parse` will convert error to `diagnostics::overlong_source()`.
if source_text.len() > MAX_LEN {
source_text = "\0";
}
let start = source_text.as_ptr();
// SAFETY: Adding `source_text.len()` to the starting pointer gives a pointer
// at the end of `source_text`. `end` will never be dereferenced, only checked
// for direct pointer equality with `ptr` to check if at end of file.
let end = unsafe { start.add(source_text.len()) };
// `saturating_sub` not `wrapping_sub` so that value doesn't wrap around if source
// is very short, and has very low memory address (e.g. 16). If that's the case,
// `end_for_batch_search_addr` will be 0, so a test whether any non-null pointer is past end
// will always test positive, and disable batch search.
let end_for_batch_search_addr = (end as usize).saturating_sub(SEARCH_BATCH_SIZE);
Self { start, end, ptr: start, end_for_batch_search_addr, _marker: PhantomData }
}
/// Get entire source text as `&str`.
#[inline]
pub(super) fn whole(&self) -> &'a str {
// SAFETY:
// `start` and `end` are created from a `&str` in `Source::new`, so `start` cannot be after `end`.
// `start` and `end` are by definition on UTF-8 char boundaries.
unsafe {
self.str_between_positions_unchecked(
SourcePosition::new(self.start),
SourcePosition::new(self.end),
)
}
}
/// Get remaining source text as `&str`.
#[inline]
pub(super) fn remaining(&self) -> &'a str {
// SAFETY:
// Invariant of `Source` is that `ptr` is always <= `end`, and is on a UTF-8 char boundary.
// `end` is pointer to end of original `&str`, so be definition a UTF-8 char boundary.
unsafe {
self.str_between_positions_unchecked(
SourcePosition::new(self.ptr),
SourcePosition::new(self.end),
)
}
}
/// Return whether at end of source.
#[inline]
pub(super) fn is_eof(&self) -> bool {
self.ptr == self.end
}
/// Get end address.
#[inline]
pub(super) fn end_addr(&self) -> usize {
self.end as usize
}
/// Get last memory address at which a batch of `Lexer::search::SEARCH_BATCH_SIZE` bytes
/// can be read without going out of bounds.
#[inline]
pub(super) fn end_for_batch_search_addr(&self) -> usize {
self.end_for_batch_search_addr
}
/// Get current position.
///
/// The `SourcePosition` returned is guaranteed to be within bounds of `&str` that `Source`
/// was created from, and on a UTF-8 character boundary, so can be used by caller
/// to later move current position of this `Source` using `Source::set_position`.
///
/// `SourcePosition` lives as long as the source text `&str` that `Source` was created from.
#[inline]
pub(super) fn position(&self) -> SourcePosition<'a> {
// SAFETY: Creating a `SourcePosition` from current position of `Source` is always valid,
// if caller has upheld safety conditions of other unsafe methods of this type.
unsafe { SourcePosition::new(self.ptr) }
}
/// Move current position.
#[inline]
pub(super) fn set_position(&mut self, pos: SourcePosition<'a>) {
// `SourcePosition` always upholds the invariants of `Source`, as long as it's created
// from this `Source`. `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
// in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
// `Source` originated on another thread can "jump" onto this one.
// This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
// from this `Source`.
// This guarantee is what allows this function to be safe.
// SAFETY: `SourcePosition::read`'s contract is upheld by:
// * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`.
// * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
// * `SourcePosition` can only live as long as the `&str` underlying `Source`.
debug_assert!(
pos.ptr >= self.start
&& pos.ptr <= self.end
// SAFETY: See above
&& (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { pos.read() }))
);
self.ptr = pos.ptr;
}
/// Advance `Source`'s cursor to end.
#[inline]
pub(super) fn advance_to_end(&mut self) {
self.ptr = self.end;
}
/// Advance `Source`'s cursor by one byte if it is equal to the given ASCII value.
///
/// # SAFETY
///
/// Caller must ensure that `ascii_byte` is a valid ASCII character.
#[inline]
pub(super) unsafe fn advance_if_ascii_eq(&mut self, ascii_byte: u8) -> bool {
debug_assert!(ascii_byte.is_ascii());
let matched = self.peek_byte() == Some(ascii_byte);
if matched {
// SAFETY: next byte exists and is a valid ASCII char (and thus UTF-8
// char boundary).
self.ptr = unsafe { self.ptr.add(1) };
}
matched
}
/// Get string slice from a `SourcePosition` up to the current position of `Source`.
pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition<'a>) -> &'a str {
assert!(pos.ptr <= self.ptr);
// SAFETY: The above assertion satisfies `str_from_pos_to_current_unchecked`'s requirements
unsafe { self.str_from_pos_to_current_unchecked(pos) }
}
/// Get string slice from a `SourcePosition` up to current position of `Source`, without checks.
///
/// # SAFETY
/// `pos` must not be after current position of `Source`.
/// This is always the case if both:
/// 1. `Source::set_position` has not been called since `pos` was created.
/// 2. `pos` has not been advanced with `SourcePosition::add`.
#[inline]
pub(super) unsafe fn str_from_pos_to_current_unchecked(
&self,
pos: SourcePosition<'a>,
) -> &'a str {
// SAFETY: Caller guarantees `pos` is not after current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr))
}
/// Get string slice from current position of `Source` up to a `SourcePosition`, without checks.
///
/// # SAFETY
/// `pos` must not be before current position of `Source`.
/// This is always the case if both:
/// 1. `Source::set_position` has not been called since `pos` was created.
/// 2. `pos` has not been moved backwards with `SourcePosition::sub`.
#[inline]
pub(super) unsafe fn str_from_current_to_pos_unchecked(
&self,
pos: SourcePosition<'a>,
) -> &'a str {
// SAFETY: Caller guarantees `pos` is not before current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos)
}
/// Get string slice from a `SourcePosition` up to the end of `Source`.
#[inline]
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition<'a>) -> &'a str {
// SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`,
// and always on a UTF-8 character boundary.
// `self.end` is always a valid `SourcePosition` due to invariants of `Source`.
unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.end)) }
}
/// Get string slice of source between 2 `SourcePosition`s, without checks.
///
/// # SAFETY
/// `start` must not be after `end`.
#[inline]
pub(super) unsafe fn str_between_positions_unchecked(
&self,
start: SourcePosition<'a>,
end: SourcePosition<'a>,
) -> &'a str {
// Check `start` is not after `end`
debug_assert!(start.ptr <= end.ptr);
// Check `start` and `end` are within bounds of `Source`
debug_assert!(start.ptr >= self.start);
debug_assert!(end.ptr <= self.end);
// Check `start` and `end` are on UTF-8 character boundaries.
// SAFETY: Above assertions ensure `start` and `end` are valid to read from if not at EOF.
debug_assert!(start.ptr == self.end || !is_utf8_cont_byte(start.read()));
debug_assert!(end.ptr == self.end || !is_utf8_cont_byte(end.read()));
// SAFETY: Caller guarantees `start` is not after `end`.
// `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
// in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
// `Source` originated on another thread can "jump" onto this one.
// This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
// from this `Source`, therefore `start.ptr` and `end.ptr` must both be within the same
// allocation, and derived from the same original pointer.
// Invariants of `Source` and `SourcePosition` types guarantee that both are positioned
// on UTF-8 character boundaries. So slicing source text between these 2 points will always
// yield a valid UTF-8 string.
let len = end.addr() - start.addr();
let slice = slice::from_raw_parts(start.ptr, len);
std::str::from_utf8_unchecked(slice)
}
/// Get current position in source, relative to start of source.
#[inline]
pub(super) fn offset(&self) -> u32 {
self.offset_of(self.position())
}
/// Get offset of `pos`.
#[expect(clippy::cast_possible_truncation)]
#[inline]
pub(super) fn offset_of(&self, pos: SourcePosition<'a>) -> u32 {
// Cannot overflow `u32` because of `MAX_LEN` check in `Source::new`
(pos.addr() - self.start as usize) as u32
}
/// Move current position back by `n` bytes.
///
/// # Panic
/// Panics if:
/// * `n` is 0.
/// * `n` is greater than current offset in source.
/// * Moving back `n` bytes would not place current position on a UTF-8 character boundary.
#[inline]
pub(super) fn back(&mut self, n: usize) {
// This assertion is essential to ensure safety of `new_pos.read()` call below.
// Without this check, calling `back(0)` on an empty `Source` would cause reading
// out of bounds.
// Compiler should remove this assertion when inlining this function,
// as long as it can deduce from calling code that `n` is non-zero.
assert!(n > 0, "Cannot call `Source::back` with 0");
// Ensure not attempting to go back to before start of source
let offset = self.ptr as usize - self.start as usize;
assert!(n <= offset, "Cannot go back {n} bytes - only {offset} bytes consumed");
// SAFETY: We have checked that `n` is less than distance between `start` and `ptr`,
// so `new_ptr` cannot be outside of allocation of original `&str`
let new_pos = unsafe { self.position().sub(n) };
// Enforce invariant that `ptr` must be positioned on a UTF-8 character boundary.
// SAFETY: `new_ptr` is in bounds of original `&str`, and `n > 0` assertion ensures
// not at the end, so valid to read a byte.
// `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `new_ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
let byte = unsafe { new_pos.read() };
assert!(!is_utf8_cont_byte(byte), "Offset is not on a UTF-8 character boundary");
// Move current position. The checks above satisfy `Source`'s invariants.
self.ptr = new_pos.ptr;
}
/// Get next char of source, and advance position to after it.
#[inline]
pub(super) fn next_char(&mut self) -> Option<char> {
// Check not at EOF and handle ASCII bytes
let byte = self.peek_byte()?;
if byte.is_ascii() {
// SAFETY: We already exited if at EOF, so `ptr < end`.
// So incrementing `ptr` cannot result in `ptr > end`.
// Current byte is ASCII, so incremented `ptr` must be on a UTF-8 character boundary.
unsafe { self.ptr = self.ptr.add(1) };
return Some(byte as char);
}
// Multi-byte Unicode character.
// Check invariant that `ptr` is on a UTF-8 character boundary.
debug_assert!(!is_utf8_cont_byte(byte));
// Create a `Chars` iterator, get next char from it, and then update `self.ptr`
// to match `Chars` iterator's updated pointer afterwards.
// `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed
// to be valid as `self.ptr`.
let mut chars = self.remaining().chars();
// SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`
let c = unsafe { chars.next().unwrap_unchecked() };
self.ptr = chars.as_str().as_ptr();
Some(c)
}
/// Get next 2 chars of source, and advance position to after them.
#[inline]
pub(super) fn next_2_chars(&mut self) -> Option<[char; 2]> {
// Check not at EOF and handle if 2 x ASCII bytes
let [byte1, byte2] = self.peek_2_bytes()?;
if byte1.is_ascii() && byte2.is_ascii() {
// SAFETY: We just checked that there are at least 2 bytes remaining,
// and next 2 bytes are ASCII, so advancing by 2 bytes must put `ptr`
// in bounds and on a UTF-8 character boundary
unsafe { self.ptr = self.ptr.add(2) };
return Some([byte1 as char, byte2 as char]);
}
// Multi-byte Unicode character.
// Check invariant that `ptr` is on a UTF-8 character boundary.
debug_assert!(!is_utf8_cont_byte(byte1));
// Create a `Chars` iterator, get next 2 chars from it, and then update `self.ptr`
// to match `Chars` iterator's updated pointer afterwards.
// `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed
// to be valid as `self.ptr`.
let mut chars = self.remaining().chars();
// SAFETY: We know that there's 2 bytes to be consumed, so first call to
// `chars.next()` must return `Some(_)`
let c1 = unsafe { chars.next().unwrap_unchecked() };
let c2 = chars.next()?;
self.ptr = chars.as_str().as_ptr();
Some([c1, c2])
}
/// Get next byte of source, and advance position to after it.
///
/// # SAFETY
/// This function may leave `Source` positioned in middle of a UTF-8 character sequence,
/// which would violate one of `Source`'s invariants.
///
/// This is OK temporarily, but caller *must* ensure the invariant is restored again.
///
/// Caller must ensure one of:
///
/// 1. No byte is returned (end of file).
/// 2. The byte returned is ASCII.
/// 3. Further calls to `Source::next_byte` or `Source::next_byte_unchecked` are made
/// to consume the rest of the multi-byte UTF-8 character, before calling any other methods
/// of `Source` (even safe methods) which rely on `Source` being positioned on a UTF-8
/// character boundary, or before passing control back to other safe code which may call them.
///
/// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining`
/// are *not* safe to call until one of above conditions is satisfied.
///
/// It will often be preferable to instead use `Source::peek_byte`, followed by `Source::next_char`,
/// which are safe methods, and compiler will often reduce to equally efficient code, if calling
/// code tests the byte returned. e.g.:
///
/// ```
/// // Consume a space
/// let byte = source.peek_byte();
/// if byte == Some(b' ') {
/// source.next_char().unwrap();
/// }
/// ```
#[expect(dead_code)]
#[inline]
unsafe fn next_byte(&mut self) -> Option<u8> {
#[expect(clippy::if_not_else)] // Hot path first
if !self.is_eof() {
// SAFETY: Safe to read from `ptr` as we just checked it's not out of bounds
Some(self.next_byte_unchecked())
} else {
None
}
}
/// Get next bytes of source, and advance position to after it, without EOF bounds-check.
///
/// # SAFETY
/// Caller must ensure `Source` is not at end of file.
///
/// This function may leave `Source` positioned in middle of a UTF-8 character sequence,
/// which would violate one of `Source`'s invariants.
///
/// This is OK temporarily, but caller *must* ensure the invariant is restored again.
///
/// Caller must ensure one of:
///
/// 1. The byte returned is ASCII.
/// 2. Further calls to `Source::next_byte` or `Source::next_byte_unchecked` are made
/// to consume the rest of the multi-byte UTF-8 character, before calling any other methods
/// of `Source` (even safe methods) which rely on `Source` being positioned on a UTF-8
/// character boundary, or before passing control back to other safe code which may call them.
///
/// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining`
/// are *not* safe to call until one of above conditions is satisfied.
#[inline]
pub(super) unsafe fn next_byte_unchecked(&mut self) -> u8 {
// SAFETY: Caller guarantees not at end of file i.e. `ptr != end`.
// Methods of this type provide no way for `ptr` to be before `start` or after `end`.
// Therefore always valid to read a byte from `ptr`, and incrementing `ptr` cannot result
// in `ptr > end`.
let byte = self.peek_byte_unchecked();
self.ptr = self.ptr.add(1);
byte
}
/// Peek next char of source, without consuming it.
#[inline]
pub(super) fn peek_char(&self) -> Option<char> {
// Check not at EOF and handle ASCII bytes
let byte = self.peek_byte()?;
if byte.is_ascii() {
return Some(byte as char);
}
// Multi-byte Unicode character.
// Check invariant that `ptr` is on a UTF-8 character boundary.
debug_assert!(!is_utf8_cont_byte(byte));
// Create a `Chars` iterator, and get next char from it
let mut chars = self.remaining().chars();
// SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`.
// Could just return `chars.next()` here, but making it clear to compiler that this branch
// always returns `Some(_)` may help it optimize the caller. Compiler seems to have difficulty
// "seeing into" `Chars` iterator and making deductions.
let c = unsafe { chars.next().unwrap_unchecked() };
Some(c)
}
/// Peek next byte of source without consuming it.
#[inline]
pub(super) fn peek_byte(&self) -> Option<u8> {
#[expect(clippy::if_not_else)] // Hot path first
if !self.is_eof() {
// SAFETY: Safe to read from `ptr` as we just checked it's not out of bounds
Some(unsafe { self.peek_byte_unchecked() })
} else {
None
}
}
/// Peek next two bytes of source without consuming them.
#[inline]
pub(super) fn peek_2_bytes(&self) -> Option<[u8; 2]> {
// `end` is always >= `ptr` so `end - ptr` cannot wrap around.
// No need to use checked/saturating subtraction here.
if (self.end as usize) - (self.ptr as usize) >= 2 {
// SAFETY: The check above ensures that there are at least 2 bytes to
// read from `self.ptr` without reading past `self.end`
let bytes = unsafe { self.position().read2() };
Some(bytes)
} else {
None
}
}
/// Peek next byte of source without consuming it, without EOF bounds-check.
///
/// # SAFETY
/// Caller must ensure `Source` is not at end of file.
#[inline]
pub(super) unsafe fn peek_byte_unchecked(&self) -> u8 {
// SAFETY: Caller guarantees `ptr` is before `end` (i.e. not at end of file).
// Methods of this type provide no way to allow `ptr` to be before `start`.
// `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
debug_assert!(self.ptr >= self.start && self.ptr < self.end);
self.position().read()
}
}
/// Wrapper around a pointer to a position in `Source`.
///
/// # SAFETY
/// `SourcePosition` must always be on a UTF-8 character boundary,
/// and within bounds of the `Source` that created it.
#[derive(Debug, Clone, Copy)]
pub struct SourcePosition<'a> {
ptr: *const u8,
_marker: PhantomData<&'a u8>,
}
impl SourcePosition<'_> {
/// Create a new `SourcePosition` from a pointer.
///
/// # SAFETY
/// * Pointer must obey all the same invariants as `Source::ptr`.
/// * It must be created from a `Source`.
/// * It must be in bounds of the source text `&str` the `Source` is created from,
/// or 1 byte after the end of the source text (i.e. positioned at EOF).
/// * It must be positioned on a UTF-8 character boundary (or EOF).
#[inline]
pub(super) unsafe fn new(ptr: *const u8) -> Self {
Self { ptr, _marker: PhantomData }
}
/// Get memory address of `SourcePosition` as a `usize`.
#[inline]
pub(super) fn addr(self) -> usize {
self.ptr as usize
}
/// Create new `SourcePosition` which is `n` bytes after this one.
/// The provenance of the pointer `SourcePosition` contains is maintained.
///
/// # SAFETY
/// Caller must ensure that advancing `SourcePosition` by `n` bytes does not make it past the end
/// of `Source` this `SourcePosition` was created from.
/// NB: It is legal to use `add` to create a `SourcePosition` which is *on* the end of `Source`,
/// just not past it.
#[inline]
pub(super) unsafe fn add(self, n: usize) -> Self {
Self::new(self.ptr.add(n))
}
/// Create new `SourcePosition` which is `n` bytes before this one.
/// The provenance of the pointer `SourcePosition` contains is maintained.
///
/// # SAFETY
/// Caller must ensure that reversing `SourcePosition` by `n` bytes does not make it before the start
/// of `Source` this `SourcePosition` was created from.
#[inline]
pub(super) unsafe fn sub(self, n: usize) -> Self {
Self::new(self.ptr.sub(n))
}
/// Read byte from this `SourcePosition`.
///
/// # SAFETY
/// Caller must ensure `SourcePosition` is not at end of source text.
///
/// # Implementation details
///
/// Using `as_ref()` for reading is copied from `core::slice::iter::next`.
/// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132
/// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168
///
/// Using `ptr.as_ref().unwrap_unchecked()` instead of `*ptr` or `ptr.read()` produces
/// a 7% speed-up on Lexer benchmarks.
/// Presumably this is because it tells the compiler it can rely on the memory being immutable,
/// because if a `&mut` reference existed, that would violate Rust's aliasing rules.
#[inline]
pub(super) unsafe fn read(self) -> u8 {
// SAFETY:
// Caller guarantees `self` is not at end of source text.
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
// `Source` conceptually holds the source text `&str`, which guarantees no mutable references
// to the same memory can exist, as that would violate Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!self.ptr.is_null());
*self.ptr.as_ref().unwrap_unchecked()
}
/// Read 2 bytes from this `SourcePosition`.
///
/// # SAFETY
/// Caller must ensure `SourcePosition` is no later than 2 bytes before end of source text.
/// i.e. if source length is 10, `self` must be on position 8 max.
#[inline]
pub(super) unsafe fn read2(self) -> [u8; 2] {
// SAFETY:
// Caller guarantees `self` is not at no later than 2 bytes before end of source text.
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
// `Source` conceptually holds the source text `&str`, which guarantees no mutable references
// to the same memory can exist, as that would violate Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!self.ptr.is_null());
#[expect(clippy::ptr_as_ptr)]
let p = self.ptr as *const [u8; 2];
*p.as_ref().unwrap_unchecked()
}
}
/// Return if byte is a UTF-8 continuation byte.
#[inline]
const fn is_utf8_cont_byte(byte: u8) -> bool {
// 0x80 - 0xBF are continuation bytes i.e. not 1st byte of a UTF-8 character sequence
byte >= 0x80 && byte < 0xC0
}