8
8
//! try to avoid the code duplication.
9
9
//! Maybe by having private generic code that is monomorphized to UTF-8 and WTF-8?
10
10
11
+ use core:: char:: { encode_utf8_raw, encode_utf16_raw} ;
11
12
use std:: mem;
12
13
use std:: raw:: Slice as RawSlice ;
13
- use std:: slice;
14
14
use super :: { Wtf8Buf , Wtf8 , CodePoint , IllFormedUtf16CodeUnits } ;
15
15
16
16
@@ -29,51 +29,12 @@ pub fn push_code_point(string: &mut Wtf8Buf, code_point: CodePoint) {
29
29
data : string. bytes . as_ptr ( ) . offset ( cur_len as isize ) ,
30
30
len : 4 ,
31
31
} ;
32
- let used = encode_wtf8 ( code_point, mem:: transmute ( slice) ) . unwrap_or ( 0 ) ;
32
+ let used = encode_utf8_raw ( code_point. to_u32 ( ) , mem:: transmute ( slice) ) . unwrap_or ( 0 ) ;
33
33
string. bytes . set_len ( cur_len + used) ;
34
34
}
35
35
}
36
36
37
37
38
- /// Copied from core::char::Char::encode_utf8
39
- #[ inline]
40
- pub fn encode_wtf8 ( code_point : CodePoint , dst : & mut [ u8 ] ) -> Option < usize > {
41
- // Marked #[inline] to allow llvm optimizing it away
42
- let code = code_point. value ;
43
- if code < MAX_ONE_B && dst. len ( ) >= 1 {
44
- dst[ 0 ] = code as u8 ;
45
- Some ( 1 )
46
- } else if code < MAX_TWO_B && dst. len ( ) >= 2 {
47
- dst[ 0 ] = ( code >> 6 & 0x1F_u32 ) as u8 | TAG_TWO_B ;
48
- dst[ 1 ] = ( code & 0x3F_u32 ) as u8 | TAG_CONT ;
49
- Some ( 2 )
50
- } else if code < MAX_THREE_B && dst. len ( ) >= 3 {
51
- dst[ 0 ] = ( code >> 12 & 0x0F_u32 ) as u8 | TAG_THREE_B ;
52
- dst[ 1 ] = ( code >> 6 & 0x3F_u32 ) as u8 | TAG_CONT ;
53
- dst[ 2 ] = ( code & 0x3F_u32 ) as u8 | TAG_CONT ;
54
- Some ( 3 )
55
- } else if dst. len ( ) >= 4 {
56
- dst[ 0 ] = ( code >> 18 & 0x07_u32 ) as u8 | TAG_FOUR_B ;
57
- dst[ 1 ] = ( code >> 12 & 0x3F_u32 ) as u8 | TAG_CONT ;
58
- dst[ 2 ] = ( code >> 6 & 0x3F_u32 ) as u8 | TAG_CONT ;
59
- dst[ 3 ] = ( code & 0x3F_u32 ) as u8 | TAG_CONT ;
60
- Some ( 4 )
61
- } else {
62
- None
63
- }
64
- }
65
-
66
- // Copied from core::char
67
- // UTF-8 ranges and tags for encoding characters
68
- static TAG_CONT : u8 = 0b1000_0000u8 ;
69
- static TAG_TWO_B : u8 = 0b1100_0000u8 ;
70
- static TAG_THREE_B : u8 = 0b1110_0000u8 ;
71
- static TAG_FOUR_B : u8 = 0b1111_0000u8 ;
72
- static MAX_ONE_B : u32 = 0x80u32 ;
73
- static MAX_TWO_B : u32 = 0x800u32 ;
74
- static MAX_THREE_B : u32 = 0x10000u32 ;
75
-
76
-
77
38
/// Copied from core::str::StrPrelude::is_char_boundary
78
39
#[ inline]
79
40
pub fn is_code_point_boundary ( slice : & Wtf8 , index : usize ) -> bool {
@@ -101,112 +62,6 @@ pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
101
62
begin, end, s) ;
102
63
}
103
64
104
- // Return the initial codepoint accumulator for the first byte.
105
- // The first byte is special, only want bottom 5 bits for width 2, 4 bits
106
- // for width 3, and 3 bits for width 4
107
- macro_rules! utf8_first_byte(
108
- ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 )
109
- ) ;
110
-
111
- // return the value of $ch updated with continuation byte $byte
112
- macro_rules! utf8_acc_cont_byte(
113
- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & CONT_MASK ) as u32 )
114
- ) ;
115
-
116
- /// Copied from core::str::StrPrelude::char_range_at
117
- #[ inline]
118
- pub fn code_point_range_at ( slice : & Wtf8 , i : usize ) -> ( CodePoint , usize ) {
119
- if slice. bytes [ i] < 128u8 {
120
- return ( CodePoint :: from_char ( slice. bytes [ i] as char ) , i + 1 ) ;
121
- }
122
-
123
- // Multibyte case is a fn to allow code_point_range_at to inline cleanly
124
- fn multibyte_code_point_range_at ( s : & Wtf8 , i : usize ) -> ( CodePoint , usize ) {
125
- let mut val = s. bytes [ i] as u32 ;
126
- let w = UTF8_CHAR_WIDTH [ val as usize ] as usize ;
127
- assert ! ( ( w != 0 ) ) ;
128
-
129
- val = utf8_first_byte ! ( val, w) ;
130
- val = utf8_acc_cont_byte ! ( val, s. bytes[ i + 1 ] ) ;
131
- if w > 2 { val = utf8_acc_cont_byte ! ( val, s. bytes[ i + 2 ] ) ; }
132
- if w > 3 { val = utf8_acc_cont_byte ! ( val, s. bytes[ i + 3 ] ) ; }
133
-
134
- return ( unsafe { CodePoint :: from_u32_unchecked ( val) } , i + w) ;
135
- }
136
-
137
- return multibyte_code_point_range_at ( slice, i) ;
138
- }
139
-
140
-
141
- // Copied from core::str
142
- /// Mask of the value bits of a continuation byte
143
- const CONT_MASK : u8 = 0b0011_1111u8 ;
144
-
145
- // Copied from core::str
146
- // https://tools.ietf.org/html/rfc3629
147
- static UTF8_CHAR_WIDTH : [ u8 ; 256 ] = [
148
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
149
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x1F
150
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
151
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x3F
152
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
153
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x5F
154
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
155
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x7F
156
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
157
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0x9F
158
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
159
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xBF
160
- 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
161
- 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // 0xDF
162
- 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // 0xEF
163
- 4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xFF
164
- ] ;
165
-
166
-
167
- /// Copied from core::str::Chars::next
168
- pub fn next_code_point ( bytes : & mut slice:: Iter < u8 > ) -> Option < CodePoint > {
169
- #[ inline]
170
- fn unwrap_or_0 ( opt : Option < & u8 > ) -> u8 {
171
- match opt {
172
- Some ( & byte) => byte,
173
- None => 0 ,
174
- }
175
- }
176
-
177
- // Decode UTF-8, using the valid UTF-8 invariant
178
- let x = match bytes. next ( ) {
179
- None => return None ,
180
- Some ( & next_byte) if next_byte < 128 => return Some ( CodePoint :: from_char ( next_byte as char ) ) ,
181
- Some ( & next_byte) => next_byte,
182
- } ;
183
-
184
- // Multibyte case follows
185
- // Decode from a byte combination out of: [[[x y] z] w]
186
- // NOTE: Performance is sensitive to the exact formulation here
187
- let init = utf8_first_byte ! ( x, 2 ) ;
188
- let y = unwrap_or_0 ( bytes. next ( ) ) ;
189
- let mut ch = utf8_acc_cont_byte ! ( init, y) ;
190
- if x >= 0xE0 {
191
- // [[x y z] w] case
192
- // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
193
- let z = unwrap_or_0 ( bytes. next ( ) ) ;
194
- let y_z = utf8_acc_cont_byte ! ( ( y & CONT_MASK ) as u32 , z) ;
195
- ch = init << 12 | y_z;
196
- if x >= 0xF0 {
197
- // [x y z w] case
198
- // use only the lower 3 bits of `init`
199
- let w = unwrap_or_0 ( bytes. next ( ) ) ;
200
- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ! ( y_z, w) ;
201
- }
202
- }
203
-
204
- // str invariant says `ch` is a valid Unicode Scalar Value
205
- unsafe {
206
- Some ( CodePoint :: from_u32_unchecked ( ch) )
207
- }
208
- }
209
-
210
65
211
66
/// Copied from core::str::Utf16CodeUnits::next
212
67
pub fn next_utf16_code_unit ( iter : & mut IllFormedUtf16CodeUnits ) -> Option < u16 > {
@@ -218,28 +73,8 @@ pub fn next_utf16_code_unit(iter: &mut IllFormedUtf16CodeUnits) -> Option<u16> {
218
73
219
74
let mut buf = [ 0u16 ; 2 ] ;
220
75
iter. code_points . next ( ) . map ( |code_point| {
221
- let n = encode_utf16 ( code_point, buf. as_mut_slice ( ) ) . unwrap_or ( 0 ) ;
76
+ let n = encode_utf16_raw ( code_point. to_u32 ( ) , buf. as_mut_slice ( ) ) . unwrap_or ( 0 ) ;
222
77
if n == 2 { iter. extra = buf[ 1 ] ; }
223
78
buf[ 0 ]
224
79
} )
225
80
}
226
-
227
- /// Copied from core::char::Char::encode_utf16
228
- #[ inline]
229
- fn encode_utf16 ( code_point : CodePoint , dst : & mut [ u16 ] ) -> Option < usize > {
230
- // Marked #[inline] to allow llvm optimizing it away
231
- let mut ch = code_point. to_u32 ( ) ;
232
- if ( ch & 0xFFFF_u32 ) == ch && dst. len ( ) >= 1 {
233
- // The BMP falls through (assuming non-surrogate, as it should)
234
- dst[ 0 ] = ch as u16 ;
235
- Some ( 1 )
236
- } else if dst. len ( ) >= 2 {
237
- // Supplementary planes break into surrogates.
238
- ch -= 0x1_0000_u32 ;
239
- dst[ 0 ] = 0xD800_u16 | ( ( ch >> 10 ) as u16 ) ;
240
- dst[ 1 ] = 0xDC00_u16 | ( ( ch as u16 ) & 0x3FF_u16 ) ;
241
- Some ( 2 )
242
- } else {
243
- None
244
- }
245
- }
0 commit comments