diff --git a/validator/cpp/htmlparser/utf8.h b/validator/cpp/htmlparser/utf8.h
new file mode 100644
index 000000000000..5b29ad79ac18
--- /dev/null
+++ b/validator/cpp/htmlparser/utf8.h
@@ -0,0 +1,161 @@
+#ifndef HTMLPARSER__UTF8_H_
+#define HTMLPARSER__UTF8_H_
+
+#include <array>
+#include <cstdint>
+
+namespace htmlparser {
+
+// NOTE: everything spelled in capitals below is a preprocessor macro. Macros
+// ignore the enclosing namespace and may evaluate their arguments more than
+// once, so do not pass expressions with side effects.
+
+// Is this an ascii character, that is, the byte is a code point in itself
+// (0..0x7f).
+#define IS_ASCII(c) (((c) & 0x80) == 0)
+
+// Is this an ascii char and a digit ('0'..'9').
+#define IS_DIGIT(c) (static_cast<uint8_t>((c) - 0x30) < 0xa)
+
+// Is this an ascii letter ('A'..'Z' or 'a'..'z').
+#define IS_ALPHABET(c) \
+  ((static_cast<uint8_t>((c) - 0x41) < 0x1a) || \
+   (static_cast<uint8_t>((c) - 0x61) < 0x1a))
+
+// Is this code point a Unicode noncharacter: U+FDD0..U+FDEF, or a code point
+// up to U+10FFFF whose low sixteen bits are FFFE or FFFF.
+#define IS_CODEPOINT_NONCHAR(c) \
+  ((c) >= 0xfdd0 && \
+   ((c) <= 0xfdef || ((c) & 0xfffe) == 0xfffe) && (c) <= 0x10ffff)
+
+// Is this code point a Unicode character: not a surrogate, not above
+// U+10FFFF and not a noncharacter.
+#define IS_CODEPOINT_CHAR(c) \
+  (static_cast<uint32_t>(c) < 0xd800 || \
+   (0xdfff < static_cast<uint32_t>(c) && \
+    static_cast<uint32_t>(c) <= 0x10ffff && !IS_CODEPOINT_NONCHAR(c)))
+
+// Counts the continuation bytes that follow lead byte c. Evaluates to 0 when
+// c is not a utf-8 lead byte (0xc2..0xf4).
+#define NUM_TRAIL_BYTES(c) \
+  (IS_UTF8_LEAD_BYTE(c) ? \
+   ((static_cast<uint8_t>(c) >= 0xe0) + \
+    (static_cast<uint8_t>(c) >= 0xf0) + 1) \
+   : 0)
+
+// Same count without the lead byte range check: bytes below 0xc2 give 0 and
+// every byte from 0xf0 upwards gives 3.
+#define LEAD_BYTE_TRAIL_COUNT(c) \
+  ((static_cast<uint8_t>(c) >= 0xc2) + \
+   (static_cast<uint8_t>(c) >= 0xe0) + \
+   (static_cast<uint8_t>(c) >= 0xf0))
+
+// Number of bytes needed to encode code point c in utf-8. Evaluates to 0 for
+// surrogates and for values above U+10FFFF.
+#define CODE_POINT_NUM_BYTES(c) \
+  (static_cast<uint32_t>(c) <= 0x7f ? 1 : \
+   (static_cast<uint32_t>(c) <= 0x7ff ? 2 : \
+    (static_cast<uint32_t>(c) <= 0xd7ff ? 3 : \
+     (static_cast<uint32_t>(c) <= 0xdfff || \
+      static_cast<uint32_t>(c) > 0x10ffff ? 0 : \
+      (static_cast<uint32_t>(c) <= 0xffff ? 3 : 4)))))
+
+// Low six bits of trail byte c, or 0 when c is not a trail byte (0x80..0xbf).
+#define READ_TRAIL_BYTE(c) \
+  (((static_cast<uint8_t>(c) & 0xc0) == 0x80) ? ((c) & 0x3f) : 0)
+
+// Valid utf-8 byte sequences and their validity macros.
+// Ref: Table 3.7 in https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+// +-------------------+------------+-------------+------------+-------------+
+// | Code Points       | First Byte | Second Byte | Third Byte | Fourth Byte |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+0000..U+007F    | 00..7F     |             |            |             |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+0080..U+07FF    | C2..DF     | 80..BF      |            |             |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+0800..U+0FFF    | E0         | A0..BF      | 80..BF     |             |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+1000..U+CFFF    | E1..EC     | 80..BF      | 80..BF     |             |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+D000..U+D7FF    | ED         | 80..9F      | 80..BF     |             |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+E000..U+FFFF    | EE..EF     | 80..BF      | 80..BF     |             |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+10000..U+3FFFF  | F0         | 90..BF      | 80..BF     | 80..BF      |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+40000..U+FFFFF  | F1..F3     | 80..BF      | 80..BF     | 80..BF      |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+100000..U+10FFFF| F4         | 80..8F      | 80..BF     | 80..BF      |
+// +-------------------+------------+-------------+------------+-------------+
+
+// Allowed second bytes of a three byte sequence. Indexed by the low nibble of
+// the lead byte (0xe0..0xef); bit n is set when a second byte whose top three
+// bits are n is allowed.
+static constexpr std::array<uint8_t, 16> k3ByteTrailByteValidity {
+  0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
+  0x10, 0x30, 0x30
+};
+
+// Allowed second bytes of a four byte sequence. Indexed by the high nibble of
+// the second byte; bit n is set when lead byte 0xf0 + n is allowed with it.
+static constexpr std::array<uint8_t, 16> k4ByteTrailByteValidity {
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x0F, 0x0F, 0x0F, 0x00,
+  0x00, 0x00, 0x00
+};
+
+// Is utf-8 lead byte (0xc2..0xf4).
+#define IS_UTF8_LEAD_BYTE(c) (static_cast<uint8_t>((c) - 0xc2) <= 0x32)
+
+// Is utf-8 trail byte (0x80..0xBF).
+#define IS_UTF8_TRAIL_BYTE(c) (static_cast<int8_t>(c) < -0x40)
+
+// Is trail_byte a valid second byte after the lead byte of a two or three byte
+// sequence (0xc2..0xef). Non-zero when valid; lead_byte itself is not checked.
+#define IS_UTF8_TRAIL_2ND_BYTE_VALID(lead_byte, trail_byte) \
+  ((lead_byte) < 0xe0 ? \
+   IS_UTF8_TRAIL_BYTE(trail_byte) : \
+   htmlparser::k3ByteTrailByteValidity[(lead_byte) & 0xf] & \
+   (1 << (static_cast<uint8_t>(trail_byte) >> 5)))
+
+// Is trail_byte a valid second byte after the lead byte of a four byte
+// sequence (0xf0..0xf4). Non-zero when valid, 0 for any other lead byte.
+#define IS_UTF8_TRAIL_3RD_BYTE_VALID(lead_byte, trail_byte) \
+  (static_cast<uint8_t>((lead_byte) - 0xf0) <= 4 ? \
+   htmlparser::k4ByteTrailByteValidity[ \
+       static_cast<uint8_t>(trail_byte) >> 4] & \
+   (1 << ((lead_byte) & 7)) : 0)
+
+// Decoders for two, three and four byte sequences. Every trail byte goes
+// through READ_TRAIL_BYTE so that only its six payload bits are used.
+#define _DECODE_UTF8_2(c1, c2) \
+  ((((c1) & 0b11111) << 6) | READ_TRAIL_BYTE(c2))
+
+#define _DECODE_UTF8_3(c1, c2, c3) \
+  ((((c1) & 0b1111) << 12) | \
+   (READ_TRAIL_BYTE(c2) << 6) | \
+   READ_TRAIL_BYTE(c3))
+
+#define _DECODE_UTF8_4(c1, c2, c3, c4) \
+  ((((c1) & 0b111) << 18) | \
+   (READ_TRAIL_BYTE(c2) << 12) | \
+   (READ_TRAIL_BYTE(c3) << 6) | \
+   READ_TRAIL_BYTE(c4))
+
+// Evaluates to its sixth argument. TO_CODEPOINT uses it to pick the decoder
+// that matches the number of bytes it was given.
+#define _DECODE_UTF8_X(x, A, B, C, D, FUNC, ...) FUNC
+
+// Decodes a utf-8 sequence, passed as two, three or four byte arguments, into
+// its code point. The sequence is not validated.
+#define TO_CODEPOINT(...) \
+  _DECODE_UTF8_X(, ##__VA_ARGS__, \
+                 _DECODE_UTF8_4(__VA_ARGS__), \
+                 _DECODE_UTF8_3(__VA_ARGS__), \
+                 _DECODE_UTF8_2(__VA_ARGS__))
+
+// Is this code point a surrogate (U+d800..U+dfff).
+#define IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)
+
+}  // namespace htmlparser
+
+#endif  // HTMLPARSER__UTF8_H_
diff --git a/validator/cpp/htmlparser/utf8_test.cc b/validator/cpp/htmlparser/utf8_test.cc
new file mode 100644
index 000000000000..3b11a73578ce
--- /dev/null
+++ b/validator/cpp/htmlparser/utf8_test.cc
@@ -0,0 +1,264 @@
+#include "utf8.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+
+TEST(UTF8Test, AsciiCharsTest) {
+  for (uint8_t i = 'a'; i <= 'z'; ++i) {
+    EXPECT_TRUE(IS_ASCII(i));
+    EXPECT_FALSE(IS_DIGIT(i));
+    EXPECT_TRUE(IS_ALPHABET(i));
+  }
+  for (uint8_t i = 'A'; i <= 'Z'; ++i) {
+    EXPECT_TRUE(IS_ASCII(i));
+    EXPECT_FALSE(IS_DIGIT(i));
+    EXPECT_TRUE(IS_ALPHABET(i));
+  }
+  for (uint8_t i = '0'; i <= '9'; ++i) {
+    EXPECT_TRUE(IS_ASCII(i));
+    EXPECT_TRUE(IS_DIGIT(i));
+    EXPECT_FALSE(IS_ALPHABET(i));
+  }
+  EXPECT_FALSE(IS_ASCII(0x80));
+  // The macro must stay correct inside a larger expression.
+  EXPECT_EQ(2, 1 + IS_ASCII('a'));
+  EXPECT_EQ(1, 1 + IS_ASCII(0x80));
+}
+
+TEST(UTF8Test, DecodeUtf8SymbolTest) {
+  // U+1D306.
+  EXPECT_EQ(TO_CODEPOINT(0xf0, 0x9d, 0x8c, 0x86), 119558);
+  // U+2318.
+  EXPECT_EQ(TO_CODEPOINT(0xe2, 0x8c, 0x98), 8984);
+  // U+20AC.
+  EXPECT_EQ(TO_CODEPOINT(0xe2, 0x82, 0xac), 8364);
+  // U+015A.
+  EXPECT_EQ(TO_CODEPOINT(0xc5, 0x9a), 346);
+}
+
+TEST(UTF8Test, ReadContinuationByteTest) {
+  // Top two bits are 11, not 10: not a continuation byte.
+  EXPECT_EQ(0, READ_TRAIL_BYTE(0b11000001));
+  // The two marker bits of a valid continuation byte are masked off.
+  EXPECT_EQ(0b00111111, READ_TRAIL_BYTE(0b10111111));
+}
+
+TEST(UTF8Test, IsTrailByteTest) {
+  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x9d));
+  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x8c));
+  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x86));
+  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x98));
+  EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(0x9a));
+  EXPECT_FALSE(IS_UTF8_TRAIL_BYTE(0xf0));
+  EXPECT_FALSE(IS_UTF8_TRAIL_BYTE(0xe2));
+  EXPECT_FALSE(IS_UTF8_TRAIL_BYTE(0xc5));
+}
+
+TEST(UTF8Test, IsLeadingByteTest) {
+  // Invalid bytes.
+  for (uint8_t i = 0; i < 194; ++i) {
+    EXPECT_FALSE(IS_UTF8_LEAD_BYTE(i));
+  }
+
+  // Valid 0xc2..0xf4.
+  for (uint8_t i = 0xc2; i < 0xf5; ++i) {
+    EXPECT_TRUE(IS_UTF8_LEAD_BYTE(i));
+  }
+
+  // Invalid bytes 0xf5 to 0xff.
+  for (uint8_t i = 245; i <= 254; ++i) {
+    EXPECT_FALSE(IS_UTF8_LEAD_BYTE(i));
+  }
+
+  EXPECT_FALSE(IS_UTF8_LEAD_BYTE(255));
+}
+
+TEST(UTF8Test, NumTrailBytesTest) {
+  EXPECT_EQ(0, NUM_TRAIL_BYTES('a'));
+  EXPECT_EQ(0, NUM_TRAIL_BYTES(0xc1));
+  EXPECT_EQ(1, NUM_TRAIL_BYTES(0xc5));
+  EXPECT_EQ(2, NUM_TRAIL_BYTES(0xe2));
+  EXPECT_EQ(3, NUM_TRAIL_BYTES(0xf0));
+  EXPECT_EQ(0, NUM_TRAIL_BYTES(0xf5));
+}
+
+TEST(UTF8Test, CodePointClassificationTest) {
+  EXPECT_TRUE(IS_CODEPOINT_NONCHAR(0xfdd0));
+  EXPECT_TRUE(IS_CODEPOINT_NONCHAR(0xfdef));
+  EXPECT_TRUE(IS_CODEPOINT_NONCHAR(0xfffe));
+  EXPECT_TRUE(IS_CODEPOINT_NONCHAR(0x10ffff));
+  EXPECT_FALSE(IS_CODEPOINT_NONCHAR(0xfdf0));
+  EXPECT_FALSE(IS_CODEPOINT_NONCHAR('a'));
+
+  EXPECT_TRUE(IS_CODEPOINT_CHAR('a'));
+  EXPECT_TRUE(IS_CODEPOINT_CHAR(0xe000));
+  EXPECT_FALSE(IS_CODEPOINT_CHAR(0xd800));
+  EXPECT_FALSE(IS_CODEPOINT_CHAR(0xfdd0));
+  EXPECT_FALSE(IS_CODEPOINT_CHAR(0x110000));
+
+  EXPECT_TRUE(IS_SURROGATE(0xd800));
+  EXPECT_TRUE(IS_SURROGATE(0xdfff));
+  EXPECT_FALSE(IS_SURROGATE(0xe000));
+}
+
+TEST(UTF8Test, CodePointByteSequenceCountTest) {
+  EXPECT_EQ(0, LEAD_BYTE_TRAIL_COUNT('a'));
+  EXPECT_EQ(1, LEAD_BYTE_TRAIL_COUNT(0xc5));
+  EXPECT_EQ(2, LEAD_BYTE_TRAIL_COUNT(0xe2));
+  EXPECT_EQ(3, LEAD_BYTE_TRAIL_COUNT(0xf0));
+
+  // c2..df, 1 subsequent byte.
+  for (uint8_t i = 0xc2; i <= 0xdf; ++i) {
+    EXPECT_EQ(1, LEAD_BYTE_TRAIL_COUNT(i));
+  }
+
+  // e0..ef, 2 subsequent bytes.
+  for (uint8_t i = 0xe0; i <= 0xef; ++i) {
+    EXPECT_EQ(2, LEAD_BYTE_TRAIL_COUNT(i));
+  }
+
+  // f0..f4, 3 subsequent bytes.
+  for (uint8_t i = 0xf0; i <= 0xf4; ++i) {
+    EXPECT_EQ(3, LEAD_BYTE_TRAIL_COUNT(i));
+  }
+}
+
+TEST(UTF8Test, CodePointNumBytesTest) {
+  EXPECT_EQ(1, CODE_POINT_NUM_BYTES('a'));
+  EXPECT_EQ(2, CODE_POINT_NUM_BYTES(346 /*"Ś"*/));
+  EXPECT_EQ(3, CODE_POINT_NUM_BYTES(8984 /*"⌘"*/));
+  EXPECT_EQ(4, CODE_POINT_NUM_BYTES(119558 /*"𝌆"*/));
+}
+
+TEST(UTF8Test, 3ByteSequenceValidityTest) {
+  // Lead byte 0xc2..0xdf.
+  for (uint8_t i = 0xc2; i <= 0xdf; ++i) {
+    // Not a trail byte.
+    for (uint8_t j = 0; j < 0x80; ++j) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+    }
+    for (uint8_t j = 0xbf + 1; j < 0xff; ++j) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+    }
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff));
+
+    // Valid range.
+    for (uint8_t j = 0x80; j <= 0xbf; ++j) {
+      EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+    }
+  }
+
+  // Lead byte 0xe0.
+  // Not a trail byte.
+  uint8_t i = 0xe0;
+  for (uint8_t j = 0; j < 0xa0; ++j) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+  }
+  for (uint8_t j = 0xa0; j <= 0xbf; ++j) {
+    EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+  }
+  // Bytes > 0xbf are invalid.
+  for (uint8_t j = 0xbf + 1; j < 0xff; ++j) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+  }
+  EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff));
+
+  // Lead byte 0xe1..0xec.
+  for (uint8_t i = 0xe1; i <= 0xec; ++i) {
+    // Not a trail byte.
+    for (uint8_t j = 0; j < 0x80; ++j) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+    }
+    for (uint8_t j = 0xbf + 1; j < 0xff; ++j) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+    }
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff));
+
+    // Valid range.
+    for (uint8_t j = 0x80; j <= 0xbf; ++j) {
+      EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+    }
+  }
+
+  // Lead byte 0xed.
+  i = 0xed;
+  // Not a trail byte.
+  for (uint8_t j = 0; j < 0x80; ++j) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+  }
+  for (uint8_t j = 0xbf + 1; j < 0xff; ++j) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+  }
+  EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff));
+
+  // Valid range.
+  for (uint8_t j = 0x80; j <= 0x9f; ++j) {
+    EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+  }
+  // Invalid range. 0xa0..0xbf.
+  for (uint8_t j = 0xa0; j <= 0xbf; ++j) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+  }
+
+  // Lead byte 0xee..0xef.
+  for (uint8_t i = 0xee; i <= 0xef; ++i) {
+    // Not a trail byte.
+    for (uint8_t j = 0; j < 0x80; ++j) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+    }
+    for (uint8_t j = 0xbf + 1; j < 0xff; ++j) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+    }
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, 0xff));
+
+    // Valid range.
+    for (uint8_t j = 0x80; j <= 0xbf; ++j) {
+      EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(i, j));
+    }
+  }
+}
+
+TEST(UTF8Test, 4ByteSequenceValidityTest) {
+  // Not a 4 byte sequence lead byte.
+  for (uint8_t i = 0; i < 0xf0; ++i) {
+    for (uint8_t j = 0; j < 0xff; ++j) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_3RD_BYTE_VALID(i, j));
+    }
+  }
+
+  // 4byte lead byte 0xf0: only 0x90..0xbf may follow.
+  for (uint8_t j = 0; j < 0x90; ++j) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_3RD_BYTE_VALID(0xf0, j));
+  }
+  for (uint8_t j = 0x90; j <= 0xbf; ++j) {
+    EXPECT_TRUE(IS_UTF8_TRAIL_3RD_BYTE_VALID(0xf0, j));
+  }
+
+  // 4byte lead byte 0xf1..0xf3
+  for (uint8_t i = 0xf1; i <= 0xf3; ++i) {
+    // Invalid trail byte.
+    for (uint8_t j = 0; j < 0x80; ++j) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_3RD_BYTE_VALID(i, j));
+    }
+    // Valid trail byte.
+    for (uint8_t j = 0x80; j <= 0xbf; ++j) {
+      EXPECT_TRUE(IS_UTF8_TRAIL_3RD_BYTE_VALID(i, j));
+    }
+  }
+
+  // 4byte lead byte 0xf4: only 0x80..0x8f may follow.
+  for (uint8_t j = 0x80; j <= 0x8f; ++j) {
+    EXPECT_TRUE(IS_UTF8_TRAIL_3RD_BYTE_VALID(0xf4, j));
+  }
+  for (uint8_t j = 0x90; j <= 0xbf; ++j) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_3RD_BYTE_VALID(0xf4, j));
+  }
+
+  // Bytes above 0xf4 are not lead bytes, whatever follows them.
+  for (int i = 0xf5; i <= 0xff; ++i) {
+    for (int j = 0x80; j <= 0xbf; ++j) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_3RD_BYTE_VALID(i, j));
+    }
+  }
+}