Skip to content

Commit 716a631

Browse files
committed
Remove part of not_quite_std.rs, use new core::**_raw functions instead.
See rust-lang/rust#21488
1 parent b4ed64d commit 716a631

File tree

2 files changed

+17
-170
lines changed

2 files changed

+17
-170
lines changed

src/lib.rs

+14-2
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@ WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
1717

1818
#![allow(unstable)]
1919

20+
extern crate core;
2021
extern crate unicode;
2122

2223

24+
use core::str::{next_code_point, char_range_at_raw};
2325
use std::str;
2426
use std::string::CowString;
2527
use std::borrow::Cow;
@@ -542,7 +544,8 @@ impl Wtf8 {
542544
/// or is beyond the end of the string.
543545
#[inline]
544546
pub fn code_point_range_at(&self, position: usize) -> (CodePoint, usize) {
545-
not_quite_std::code_point_range_at(self, position)
547+
let (c, n) = char_range_at_raw(&self.bytes, position);
548+
(unsafe { CodePoint::from_u32_unchecked(c) }, n)
546549
}
547550

548551
/// Return an iterator for the string’s code points.
@@ -693,7 +696,16 @@ impl<'a> Iterator for Wtf8CodePoints<'a> {
693696

694697
#[inline]
695698
fn next(&mut self) -> Option<CodePoint> {
696-
not_quite_std::next_code_point(&mut self.bytes)
699+
match next_code_point(&mut self.bytes) {
700+
None => None,
701+
Some(value) => {
702+
// Wtf8 invariant says `value` is a valid code point
703+
unsafe {
704+
Some(CodePoint::from_u32_unchecked(value))
705+
}
706+
}
707+
}
708+
697709
}
698710

699711
#[inline]

src/not_quite_std.rs

+3-168
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
//! try to avoid the code duplication.
99
//! Maybe by having private generic code that is monomorphized to UTF-8 and WTF-8?
1010
11+
use core::char::{encode_utf8_raw, encode_utf16_raw};
1112
use std::mem;
1213
use std::raw::Slice as RawSlice;
13-
use std::slice;
1414
use super::{Wtf8Buf, Wtf8, CodePoint, IllFormedUtf16CodeUnits};
1515

1616

@@ -29,51 +29,12 @@ pub fn push_code_point(string: &mut Wtf8Buf, code_point: CodePoint) {
2929
data: string.bytes.as_ptr().offset(cur_len as isize),
3030
len: 4,
3131
};
32-
let used = encode_wtf8(code_point, mem::transmute(slice)).unwrap_or(0);
32+
let used = encode_utf8_raw(code_point.to_u32(), mem::transmute(slice)).unwrap_or(0);
3333
string.bytes.set_len(cur_len + used);
3434
}
3535
}
3636

3737

38-
/// Copied from core::char::Char::encode_utf8
39-
#[inline]
40-
pub fn encode_wtf8(code_point: CodePoint, dst: &mut [u8]) -> Option<usize> {
41-
// Marked #[inline] to allow llvm optimizing it away
42-
let code = code_point.value;
43-
if code < MAX_ONE_B && dst.len() >= 1 {
44-
dst[0] = code as u8;
45-
Some(1)
46-
} else if code < MAX_TWO_B && dst.len() >= 2 {
47-
dst[0] = (code >> 6 & 0x1F_u32) as u8 | TAG_TWO_B;
48-
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
49-
Some(2)
50-
} else if code < MAX_THREE_B && dst.len() >= 3 {
51-
dst[0] = (code >> 12 & 0x0F_u32) as u8 | TAG_THREE_B;
52-
dst[1] = (code >> 6 & 0x3F_u32) as u8 | TAG_CONT;
53-
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
54-
Some(3)
55-
} else if dst.len() >= 4 {
56-
dst[0] = (code >> 18 & 0x07_u32) as u8 | TAG_FOUR_B;
57-
dst[1] = (code >> 12 & 0x3F_u32) as u8 | TAG_CONT;
58-
dst[2] = (code >> 6 & 0x3F_u32) as u8 | TAG_CONT;
59-
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
60-
Some(4)
61-
} else {
62-
None
63-
}
64-
}
65-
66-
// Copied from core::char
67-
// UTF-8 ranges and tags for encoding characters
68-
static TAG_CONT: u8 = 0b1000_0000u8;
69-
static TAG_TWO_B: u8 = 0b1100_0000u8;
70-
static TAG_THREE_B: u8 = 0b1110_0000u8;
71-
static TAG_FOUR_B: u8 = 0b1111_0000u8;
72-
static MAX_ONE_B: u32 = 0x80u32;
73-
static MAX_TWO_B: u32 = 0x800u32;
74-
static MAX_THREE_B: u32 = 0x10000u32;
75-
76-
7738
/// Copied from core::str::StrPrelude::is_char_boundary
7839
#[inline]
7940
pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
@@ -101,112 +62,6 @@ pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
10162
begin, end, s);
10263
}
10364

104-
// Return the initial codepoint accumulator for the first byte.
105-
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
106-
// for width 3, and 3 bits for width 4
107-
macro_rules! utf8_first_byte(
108-
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
109-
);
110-
111-
// return the value of $ch updated with continuation byte $byte
112-
macro_rules! utf8_acc_cont_byte(
113-
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & CONT_MASK) as u32)
114-
);
115-
116-
/// Copied from core::str::StrPrelude::char_range_at
117-
#[inline]
118-
pub fn code_point_range_at(slice: &Wtf8, i: usize) -> (CodePoint, usize) {
119-
if slice.bytes[i] < 128u8 {
120-
return (CodePoint::from_char(slice.bytes[i] as char), i + 1);
121-
}
122-
123-
// Multibyte case is a fn to allow code_point_range_at to inline cleanly
124-
fn multibyte_code_point_range_at(s: &Wtf8, i: usize) -> (CodePoint, usize) {
125-
let mut val = s.bytes[i] as u32;
126-
let w = UTF8_CHAR_WIDTH[val as usize] as usize;
127-
assert!((w != 0));
128-
129-
val = utf8_first_byte!(val, w);
130-
val = utf8_acc_cont_byte!(val, s.bytes[i + 1]);
131-
if w > 2 { val = utf8_acc_cont_byte!(val, s.bytes[i + 2]); }
132-
if w > 3 { val = utf8_acc_cont_byte!(val, s.bytes[i + 3]); }
133-
134-
return (unsafe { CodePoint::from_u32_unchecked(val) }, i + w);
135-
}
136-
137-
return multibyte_code_point_range_at(slice, i);
138-
}
139-
140-
141-
// Copied from core::str
142-
/// Mask of the value bits of a continuation byte
143-
const CONT_MASK: u8 = 0b0011_1111u8;
144-
145-
// Copied from core::str
146-
// https://tools.ietf.org/html/rfc3629
147-
static UTF8_CHAR_WIDTH: [u8; 256] = [
148-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
149-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
150-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
151-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
152-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
153-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
154-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
155-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
156-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
157-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
158-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
159-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
160-
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
161-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
162-
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
163-
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
164-
];
165-
166-
167-
/// Copied from core::str::Chars::next
168-
pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<CodePoint> {
169-
#[inline]
170-
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
171-
match opt {
172-
Some(&byte) => byte,
173-
None => 0,
174-
}
175-
}
176-
177-
// Decode UTF-8, using the valid UTF-8 invariant
178-
let x = match bytes.next() {
179-
None => return None,
180-
Some(&next_byte) if next_byte < 128 => return Some(CodePoint::from_char(next_byte as char)),
181-
Some(&next_byte) => next_byte,
182-
};
183-
184-
// Multibyte case follows
185-
// Decode from a byte combination out of: [[[x y] z] w]
186-
// NOTE: Performance is sensitive to the exact formulation here
187-
let init = utf8_first_byte!(x, 2);
188-
let y = unwrap_or_0(bytes.next());
189-
let mut ch = utf8_acc_cont_byte!(init, y);
190-
if x >= 0xE0 {
191-
// [[x y z] w] case
192-
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
193-
let z = unwrap_or_0(bytes.next());
194-
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
195-
ch = init << 12 | y_z;
196-
if x >= 0xF0 {
197-
// [x y z w] case
198-
// use only the lower 3 bits of `init`
199-
let w = unwrap_or_0(bytes.next());
200-
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
201-
}
202-
}
203-
204-
// str invariant says `ch` is a valid Unicode Scalar Value
205-
unsafe {
206-
Some(CodePoint::from_u32_unchecked(ch))
207-
}
208-
}
209-
21065

21166
/// Copied from core::str::Utf16CodeUnits::next
21267
pub fn next_utf16_code_unit(iter: &mut IllFormedUtf16CodeUnits) -> Option<u16> {
@@ -218,28 +73,8 @@ pub fn next_utf16_code_unit(iter: &mut IllFormedUtf16CodeUnits) -> Option<u16> {
21873

21974
let mut buf = [0u16; 2];
22075
iter.code_points.next().map(|code_point| {
221-
let n = encode_utf16(code_point, buf.as_mut_slice()).unwrap_or(0);
76+
let n = encode_utf16_raw(code_point.to_u32(), buf.as_mut_slice()).unwrap_or(0);
22277
if n == 2 { iter.extra = buf[1]; }
22378
buf[0]
22479
})
22580
}
226-
227-
/// Copied from core::char::Char::encode_utf16
228-
#[inline]
229-
fn encode_utf16(code_point: CodePoint, dst: &mut [u16]) -> Option<usize> {
230-
// Marked #[inline] to allow llvm optimizing it away
231-
let mut ch = code_point.to_u32();
232-
if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 {
233-
// The BMP falls through (assuming non-surrogate, as it should)
234-
dst[0] = ch as u16;
235-
Some(1)
236-
} else if dst.len() >= 2 {
237-
// Supplementary planes break into surrogates.
238-
ch -= 0x1_0000_u32;
239-
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
240-
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
241-
Some(2)
242-
} else {
243-
None
244-
}
245-
}

0 commit comments

Comments
 (0)