-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
UTF8 encoding/decoding library to deprecated utf8 functions in string…
…s.h|cc (#36184) library. PiperOrigin-RevId: 398102411 Co-authored-by: Amaltas Bohra <amaltas@google.com>
- Loading branch information
Showing
2 changed files
with
337 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
#ifndef HTMLPARSER__UTF8_H_ | ||
#define HTMLPARSER__UTF8_H_ | ||
|
||
#include <array>
#include <cstdint>
|
||
namespace htmlparser { | ||
|
||
// True when `c` is an ASCII byte (0x00..0x7f), i.e. the byte is a complete
// code point by itself. Both the argument and the whole expansion are
// parenthesized so the macro composes safely inside larger expressions.
#define IS_ASCII(c) (((c) & 0x80) == 0)
|
||
// True when byte `c` is an ASCII decimal digit ('0'..'9').
// Subtracting '0' (0x30) and narrowing to uint8_t wraps every byte below '0'
// to a large value, so a single unsigned comparison checks both bounds.
#define IS_DIGIT(c) (static_cast<uint8_t>((c) - 0x30) < 0xa)
|
||
// True when byte `c` is an ASCII letter: 'A'..'Z' (0x41..0x5a) or
// 'a'..'z' (0x61..0x7a). Each range is tested with the subtract-and-wrap
// trick (26 == 0x1a letters per range). Note that `c` is evaluated twice.
#define IS_ALPHABET(c) \
  ((static_cast<uint8_t>((c) - 0x41) < 0x1a) || \
   (static_cast<uint8_t>((c) - 0x61) < 0x1a))
|
||
// True when code point `c` is a Unicode noncharacter: U+FDD0..U+FDEF, or any
// code point up to U+10FFFF whose low 16 bits are FFFE or FFFF.
#define IS_CODEPOINT_NONCHAR(c) \
  ((c) >= 0xfdd0 && \
   ((c) <= 0xfdef || ((c) & 0xfffe) == 0xfffe) && (c) <= 0x10ffff)

// True when code point `c` is a Unicode character: in range 0..U+10FFFF,
// not a surrogate (U+D800..U+DFFF) and not a noncharacter. The unsigned cast
// in the first test makes negative values fall through to the second test,
// which rejects them.
#define IS_CODEPOINT_CHAR(c) \
  (static_cast<uint32_t>(c) < 0xd800 || \
   (0xdfff < (c) && (c) <= 0x10ffff && !IS_CODEPOINT_NONCHAR(c)))
|
||
// Number of trail (continuation) bytes that follow byte `c`.
// For a UTF-8 lead byte (0xc2..0xf4, as tested by IS_UTF8_LEAD_BYTE) this is
// 1, 2 or 3; for every other byte (ASCII, stray trail bytes, bytes that can
// never start a sequence) it is 0.
#define NUM_TRAIL_BYTES(c) \
  (IS_UTF8_LEAD_BYTE(c) ? \
      ((static_cast<uint8_t>(c) >= 0xe0) + \
       (static_cast<uint8_t>(c) >= 0xf0) + 1) \
      : 0)
|
||
// Number of trail bytes implied by lead byte `c`, computed without checking
// that `c` really is a lead byte: each threshold passed (0xc2, 0xe0, 0xf0)
// adds one. Bytes below 0xc2 yield 0; note that bytes above 0xf4, which can
// never start a sequence, still yield 3.
#define LEAD_BYTE_TRAIL_COUNT(c) \
  ((static_cast<uint8_t>(c) >= 0xc2) + \
   (static_cast<uint8_t>(c) >= 0xe0) + \
   (static_cast<uint8_t>(c) >= 0xf0))
|
||
// Number of bytes needed to encode code point `c` in UTF-8 (1..4), or 0 when
// `c` cannot be encoded: a surrogate (U+D800..U+DFFF) or a value above
// U+10FFFF. The unsigned cast makes negative inputs land in the
// "above U+10FFFF" case. The chain is evaluated top to bottom, so each test
// only needs an upper bound.
#define CODE_POINT_NUM_BYTES(c) \
  (static_cast<uint32_t>(c) <= 0x7f ? 1 : \
   static_cast<uint32_t>(c) <= 0x7ff ? 2 : \
   static_cast<uint32_t>(c) <= 0xd7ff ? 3 : \
   (static_cast<uint32_t>(c) <= 0xdfff || \
    static_cast<uint32_t>(c) > 0x10ffff) ? 0 : \
   static_cast<uint32_t>(c) <= 0xffff ? 3 : 4)
|
||
// Payload of a UTF-8 trail byte: when the top two bits of `c` are 10 the
// result is the low six bits of `c`; for any other byte the result is 0.
// Note that `c` is evaluated twice.
#define READ_TRAIL_BYTE(c) \
  (((static_cast<uint8_t>(c) & 0xc0) == 0x80) ? ((c) & 0x3f) : 0)
|
||
// Valid utf-8 byte sequences and their validity macros. | ||
// Ref: Table 3.7 in https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | U+0000..U+007F | 00..7F | | | | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | U+0080..U+07FF | C2..DF | 80..BF | | | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | U+D000..U+D7FF | ED | 80..9F | 80..BF | | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
// | U+100000..U+10FFFF| F4 | 80..8F | 80..BF | 80..BF | | ||
// +-------------------+------------+-------------+------------+-------------+ | ||
|
||
// Second-byte validity for three-byte sequences. The table is indexed by the
// low nibble of the lead byte (0xe0..0xef); bit n of an entry is set when a
// second byte whose top three bits equal n is allowed:
//   0x20 -> 0xa0..0xbf only (lead 0xe0)
//   0x30 -> 0x80..0xbf      (leads 0xe1..0xec and 0xee..0xef)
//   0x10 -> 0x80..0x9f only (lead 0xed, which excludes the surrogates)
static constexpr std::array<uint8_t, 16> k3ByteTrailByteValidity {
  0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
  0x30, 0x30, 0x30, 0x30, 0x30, 0x10, 0x30, 0x30
};
|
||
// Second-byte validity for four-byte sequences. The table is indexed by the
// high nibble of the second byte; bit n of an entry is set when that byte may
// follow the lead byte whose low three bits equal n (0xf0 -> bit 0, ...,
// 0xf4 -> bit 4):
//   index 0x8 (0x80..0x8f): 0x1e -> leads 0xf1..0xf4
//   index 0x9..0xb (0x90..0xbf): 0x0f -> leads 0xf0..0xf3
// Every other index is 0 because those bytes are not trail bytes.
static constexpr std::array<uint8_t, 16> k4ByteTrailByteValidity {
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x1E, 0x0F, 0x0F, 0x0F, 0x00, 0x00, 0x00, 0x00
};
|
||
// True when byte `c` is a UTF-8 lead byte (0xc2..0xf4).
// Subtracting 0xc2 and narrowing to uint8_t wraps smaller bytes to large
// values, so one comparison against 0x32 (== 0xf4 - 0xc2) checks both bounds.
#define IS_UTF8_LEAD_BYTE(c) (static_cast<uint8_t>((c) - 0xc2) <= 0x32)
|
||
// True when byte `c` is a UTF-8 trail byte (0x80..0xbf).
// Reinterpreted as a signed 8-bit value, exactly those bytes map to
// -128..-65, i.e. strictly below -0x40.
#define IS_UTF8_TRAIL_BYTE(c) (static_cast<int8_t>(c) < -0x40)
|
||
// True when `trail_byte` is an acceptable second byte after `lead_byte`.
//  - lead_byte below 0xe0: any trail byte (0x80..0xbf) is accepted.
//  - otherwise: k3ByteTrailByteValidity is consulted, which narrows the range
//    for leads 0xe0 (0xa0..0xbf) and 0xed (0x80..0x9f).
// NOTE(review): the macro does not itself check that `lead_byte` is a two- or
// three-byte lead (0xc2..0xef); callers are presumably expected to have
// established that already.
// The lead byte is compared as uint8_t so a signed `char` argument behaves
// like its unsigned value, and the result is always a bool.
#define IS_UTF8_TRAIL_2ND_BYTE_VALID(lead_byte, trail_byte) \
  (static_cast<uint8_t>(lead_byte) < 0xe0 \
       ? IS_UTF8_TRAIL_BYTE(trail_byte) \
       : (htmlparser::k3ByteTrailByteValidity[(lead_byte) & 0xf] & \
          (1 << (static_cast<uint8_t>(trail_byte) >> 5))) != 0)
|
||
// True when `trail_byte` is an acceptable byte directly after the four-byte
// lead `lead_byte`, according to k4ByteTrailByteValidity:
//   0xf0 -> 0x90..0xbf, 0xf1..0xf3 -> 0x80..0xbf, 0xf4 -> 0x80..0x8f.
// Any `lead_byte` outside 0xf0..0xf4 yields false. The upper bound matters
// because the table is addressed with `lead_byte & 7`, so 0xf8..0xfc would
// otherwise alias 0xf0..0xf4. The result is always a bool.
#define IS_UTF8_TRAIL_3RD_BYTE_VALID(lead_byte, trail_byte) \
  (static_cast<uint8_t>((lead_byte) - 0xf0) <= 4 && \
   (htmlparser::k4ByteTrailByteValidity[ \
        static_cast<uint8_t>(trail_byte) >> 4] & \
    (1 << ((lead_byte) & 7))) != 0)
|
||
// Decoders for UTF-8 sequences of two, three and four bytes. The lead byte
// contributes its low 5, 4 or 3 bits; every following byte contributes its
// low six bits through READ_TRAIL_BYTE, which yields 0 for a byte that is not
// a trail byte. No further validation is performed. Each expansion is fully
// parenthesized so it can be used as an operand of any operator.
#define _DECODE_UTF8_2(c1, c2) \
  ((((c1) & 0b11111) << 6) | READ_TRAIL_BYTE(c2))

#define _DECODE_UTF8_3(c1, c2, c3) \
  ((((c1) & 0b1111) << 12) | \
   (READ_TRAIL_BYTE(c2) << 6) | \
   READ_TRAIL_BYTE(c3))

#define _DECODE_UTF8_4(c1, c2, c3, c4) \
  ((((c1) & 0b111) << 18) | \
   (READ_TRAIL_BYTE(c2) << 12) | \
   (READ_TRAIL_BYTE(c3) << 6) | \
   READ_TRAIL_BYTE(c4))

// Argument-counting helper: whatever lands in the FUNC slot is the result.
#define _DECODE_UTF8_X(x, A, B, C, D, FUNC, ...) FUNC

// Decodes a UTF-8 sequence given as 2, 3 or 4 separate byte arguments into
// its code point. The byte arguments shift the three candidate decoders
// along the parameter list of _DECODE_UTF8_X so that the decoder matching
// the argument count ends up in the FUNC slot; the other candidates are
// discarded without being expanded.
#define TO_CODEPOINT(...) \
  _DECODE_UTF8_X(, ##__VA_ARGS__, \
                 _DECODE_UTF8_4(__VA_ARGS__), \
                 _DECODE_UTF8_3(__VA_ARGS__), \
                 _DECODE_UTF8_2(__VA_ARGS__))
|
||
// True when code point `c` is a UTF-16 surrogate (U+D800..U+DFFF).
// Masking off the low 11 bits leaves 0xd800 for exactly that range.
#define IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)
|
||
} // namespace htmlparser | ||
|
||
|
||
#endif // HTMLPARSER__UTF8_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
#include "utf8.h" | ||
|
||
#include "gtest/gtest.h" | ||
|
||
// IS_ASCII, IS_DIGIT and IS_ALPHABET classify ASCII letters and digits
// correctly, and bytes with the high bit set are not ASCII.
TEST(UTF8Test, AsciiCharsTest) {
  for (uint8_t c = 'a'; c <= 'z'; ++c) {
    EXPECT_TRUE(IS_ASCII(c));
    EXPECT_FALSE(IS_DIGIT(c));
    EXPECT_TRUE(IS_ALPHABET(c));
  }
  for (uint8_t c = 'A'; c <= 'Z'; ++c) {
    EXPECT_TRUE(IS_ASCII(c));
    EXPECT_FALSE(IS_DIGIT(c));
    EXPECT_TRUE(IS_ALPHABET(c));
  }
  for (uint8_t c = '0'; c <= '9'; ++c) {
    EXPECT_TRUE(IS_ASCII(c));
    EXPECT_TRUE(IS_DIGIT(c));
    EXPECT_FALSE(IS_ALPHABET(c));
  }

  // The first and last bytes outside the ASCII range.
  EXPECT_FALSE(IS_ASCII(0x80));
  EXPECT_FALSE(IS_ASCII(0xff));
}
|
||
// TO_CODEPOINT decodes one well-formed sequence of each multi-byte length.
TEST(UTF8Test, DecodeUtf8SymbolTest) {
  // f0 9d 8c 86 -> U+1D306 (119558).
  EXPECT_EQ(TO_CODEPOINT(0xf0, 0x9d, 0x8c, 0x86), 119558);
  // e2 8e 98 -> U+2398 (9112). (e2 8c 98 would be U+2318 = 8984.)
  EXPECT_EQ(TO_CODEPOINT(0xe2, 0x8e, 0x98), 9112);
  // c5 9a -> U+015A (346).
  EXPECT_EQ(TO_CODEPOINT(0xc5, 0x9a), 346);
}
|
||
// READ_TRAIL_BYTE yields the low six bits of a continuation byte and 0 for
// anything else.
TEST(UTF8Test, ReadContinuationByteTest) {
  // Top two bits are 11, not 10, so this is not a continuation byte.
  EXPECT_EQ(0, READ_TRAIL_BYTE(0b11000001));
  // Top two bits are 10: they are masked off and the six payload bits remain.
  EXPECT_EQ(0b00111111, READ_TRAIL_BYTE(0b10111111));
}
|
||
// IS_UTF8_TRAIL_BYTE accepts continuation bytes and rejects lead bytes.
TEST(UTF8Test, IsTrailByteTest) {
  // Continuation bytes (0x80..0xbf).
  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x9d));
  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x8c));
  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x86));
  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x98));
  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x9a));

  // Lead bytes of four-, three- and two-byte sequences.
  EXPECT_FALSE(IS_UTF8_TRAIL_BYTE(0xf0));
  EXPECT_FALSE(IS_UTF8_TRAIL_BYTE(0xe2));
  EXPECT_FALSE(IS_UTF8_TRAIL_BYTE(0xc5));
}
|
||
// IS_UTF8_LEAD_BYTE holds for exactly the bytes 0xc2..0xf4.
TEST(UTF8Test, IsLeadingByteTest) {
  // An int counter lets the loop visit 0xff without the 8-bit wrap-around
  // that a uint8_t counter would suffer.
  for (int byte = 0; byte <= 0xff; ++byte) {
    const bool is_lead = 0xc2 <= byte && byte <= 0xf4;
    EXPECT_EQ(is_lead, static_cast<bool>(IS_UTF8_LEAD_BYTE(byte)))
        << "byte=" << byte;
  }
}
|
||
// LEAD_BYTE_TRAIL_COUNT reports how many bytes follow each kind of lead byte.
TEST(UTF8Test, CodePointByteSequenceCountTest) {
  EXPECT_EQ(0, LEAD_BYTE_TRAIL_COUNT('a'));
  EXPECT_EQ(1, LEAD_BYTE_TRAIL_COUNT(0xc5));
  EXPECT_EQ(2, LEAD_BYTE_TRAIL_COUNT(0xe2));
  EXPECT_EQ(3, LEAD_BYTE_TRAIL_COUNT(0xf0));

  // c2..df: one subsequent byte.
  for (uint8_t lead = 0xc2; lead <= 0xdf; ++lead) {
    EXPECT_EQ(1, LEAD_BYTE_TRAIL_COUNT(lead));
  }

  // e0..ef: two subsequent bytes.
  for (uint8_t lead = 0xe0; lead <= 0xef; ++lead) {
    EXPECT_EQ(2, LEAD_BYTE_TRAIL_COUNT(lead));
  }

  // f0..f4: three subsequent bytes.
  for (uint8_t lead = 0xf0; lead <= 0xf4; ++lead) {
    EXPECT_EQ(3, LEAD_BYTE_TRAIL_COUNT(lead));
  }
}
|
||
// CODE_POINT_NUM_BYTES reports the encoded length of a code point, or 0 when
// the value cannot be encoded.
TEST(UTF8Test, CodePointNumBytesTest) {
  EXPECT_EQ(1, CODE_POINT_NUM_BYTES('a'));
  EXPECT_EQ(2, CODE_POINT_NUM_BYTES(346 /*"Ś"*/));
  EXPECT_EQ(3, CODE_POINT_NUM_BYTES(9112));
  EXPECT_EQ(4, CODE_POINT_NUM_BYTES(119558 /*"𝌆"*/));

  // Surrogates and values above U+10FFFF have no UTF-8 encoding.
  EXPECT_EQ(0, CODE_POINT_NUM_BYTES(0xd800));
  EXPECT_EQ(0, CODE_POINT_NUM_BYTES(0xdfff));
  EXPECT_EQ(0, CODE_POINT_NUM_BYTES(0x110000));
}
|
||
TEST(UTF8Test, 3ByteSequenceValidityTest) { | ||
// Lead byte 0xc2..0xdf. | ||
for (uint8_t i = 0xc2; i <= 0xdf; ++i) { | ||
// Not a trail byte. | ||
for (uint8_t j = 0; j < 0x80; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
for (uint8_t j = 0xbf + 1; j < 0xff; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff)); | ||
|
||
// Valid range. | ||
for (uint8_t j = 0x80; j <= 0xbf; ++j) { | ||
EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
} | ||
|
||
// Lead byte 0xe0. | ||
// Not a trail byte. | ||
uint8_t i = 0xe0; | ||
for (uint8_t j = 0; j < 0xa0; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
for (uint8_t j = 0xa0; j <= 0xbf; ++j) { | ||
EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
// Bytes > 0xbf is invalid. | ||
for (uint8_t j = 0xbf + 1; j < 0xff; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff)); | ||
|
||
// Lead byte 0xe1..0xec. | ||
for (uint8_t i = 0xe1; i <= 0xec; ++i) { | ||
// Not a trail byte. | ||
for (uint8_t j = 0; j < 0x80; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
for (uint8_t j = 0xbf + 1; j < 0xff; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff)); | ||
|
||
// Valid range. | ||
for (uint8_t j = 0x80; j <= 0xbf; ++j) { | ||
EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
} | ||
|
||
// Lead byte 0xed. | ||
i = 0xed; | ||
// Not a trail byte. | ||
for (uint8_t j = 0; j < 0x80; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
for (uint8_t j = 0xbf + 1; j < 0xff; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff)); | ||
|
||
// Valid range. | ||
for (uint8_t j = 0x80; j <= 0x9f; ++j) { | ||
EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
// Invalid range. 0xa0..0xbf. | ||
for (uint8_t j = 0xa0; j <= 0xbf; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
|
||
// Lead byte 0xee..0xef. | ||
for (uint8_t i = 0xee; i <= 0xef; ++i) { | ||
// Not a trail byte. | ||
for (uint8_t j = 0; j < 0x80; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
for (uint8_t j = 0xbf + 1; j < 0xff; ++j) { | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff)); | ||
|
||
// Valid range. | ||
for (uint8_t j = 0x80; j <= 0xbf; ++j) { | ||
EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j)); | ||
} | ||
} | ||
} | ||
|
||
// Validity of the byte that follows a four-byte lead, checked for every lead
// byte up to 0xf4 against every possible following byte.
TEST(UTF8Test, 4ByteSequenceValidityTest) {
  for (int lead = 0; lead <= 0xf4; ++lead) {
    // Accepted range for this lead; an empty range (first > last) means the
    // byte is not a four-byte lead, so nothing may be accepted.
    int first_valid = 1;
    int last_valid = 0;
    if (lead == 0xf0) {
      first_valid = 0x90;
      last_valid = 0xbf;
    } else if (0xf1 <= lead && lead <= 0xf3) {
      first_valid = 0x80;
      last_valid = 0xbf;
    } else if (lead == 0xf4) {
      first_valid = 0x80;
      last_valid = 0x8f;
    }

    const uint8_t lead_byte = static_cast<uint8_t>(lead);
    for (int trail = 0; trail <= 0xff; ++trail) {
      const uint8_t trail_byte = static_cast<uint8_t>(trail);
      const bool expected = first_valid <= trail && trail <= last_valid;
      EXPECT_EQ(expected, static_cast<bool>(IS_UTF8_TRAIL_3RD_BYTE_VALID(
                              lead_byte, trail_byte)))
          << "lead=" << lead << " trail=" << trail;
    }
  }
}
|