diff --git a/validator/cpp/htmlparser/utf8.h b/validator/cpp/htmlparser/utf8.h
new file mode 100644
index 000000000000..5b29ad79ac18
--- /dev/null
+++ b/validator/cpp/htmlparser/utf8.h
@@ -0,0 +1,135 @@
+#ifndef HTMLPARSER__UTF8_H_
+#define HTMLPARSER__UTF8_H_
+
+#include <array>
+#include <cstdint>
+
+namespace htmlparser {
+
+// True when |c| is an ASCII byte (0x00..0x7f), i.e. a code point in itself.
+// Argument and expansion are parenthesized so the macro nests safely.
+#define IS_ASCII(c) (((c) & 0x80) == 0)
+
+// True when |c| is an ASCII digit '0'..'9'; unsigned wrap rejects c < '0'.
+#define IS_DIGIT(c) (static_cast<uint8_t>((c) - 0x30) < 0xa)
+
+// True when |c| is an ASCII letter: upper case 'A'..'Z' or lower 'a'..'z'.
+#define IS_ALPHABET(c) (static_cast<uint8_t>((c) - 0x41) < 0x1a || \
+                        static_cast<uint8_t>((c) - 0x61) < 0x1a)
+
+// Is this code point a Unicode noncharacter (U+FDD0..U+FDEF or U+nFFFE/F).
+#define IS_CODEPOINT_NONCHAR(c) \
+  ((c) >= 0xfdd0 && \
+   ((c) <= 0xfdef || ((c) & 0xfffe) == 0xfffe) && (c) <= 0x10ffff)
+
+// Is this code point a Unicode character (no surrogate, no noncharacter).
+#define IS_CODEPOINT_CHAR(c) (static_cast<uint32_t>(c) < 0xd800 || \
+  (0xdfff < (c) && (c) <= 0x10ffff && !IS_CODEPOINT_NONCHAR(c)))
+
+// Number of continuation (trail) bytes that follow lead byte |c|: 1..3 for a
+// valid UTF-8 lead byte (0xc2..0xf4), 0 for any other byte.
+#define NUM_TRAIL_BYTES(c) \
+  (IS_UTF8_LEAD_BYTE(c) ? \
+   (static_cast<uint8_t>(c) >= 0xe0) + (static_cast<uint8_t>(c) >= 0xf0) + 1 \
+   : 0)
+
+// Trail-byte count implied by byte |c|, without lead-byte validation: bytes
+// below 0xc2 give 0, 0xc2..0xdf give 1, 0xe0..0xef give 2, 0xf0 and up give 3.
+#define LEAD_BYTE_TRAIL_COUNT(c) ((static_cast<uint8_t>(c) >= 0xc2) + \
+  (static_cast<uint8_t>(c) >= 0xe0) + (static_cast<uint8_t>(c) >= 0xf0))
+
+// Number of bytes needed to encode code point |c| in UTF-8 (1..4), or 0 when
+// |c| is a surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
+// NOTE: |c| is evaluated several times; do not pass side-effecting arguments.
+#define CODE_POINT_NUM_BYTES(c) \
+  (static_cast<uint32_t>(c) <= 0x7f ? 1 : \
+   static_cast<uint32_t>(c) <= 0x7ff ? 2 : \
+   static_cast<uint32_t>(c) <= 0xd7ff ? 3 : \
+   (static_cast<uint32_t>(c) <= 0xdfff || \
+    static_cast<uint32_t>(c) > 0x10ffff) ? 0 : \
+   static_cast<uint32_t>(c) <= 0xffff ? 3 : \
+   4)
+
+// Payload (low six bits) of continuation byte |c|; 0 if |c| is not 10xxxxxx.
+#define READ_TRAIL_BYTE(c) \
+  (((static_cast<uint8_t>(c) & 0xc0) == 0x80) ? ((c) & 0x3f) : 0)
+
+// Well-formed UTF-8 byte sequences; the tables and macros below validate them.
+// Ref: Table 3-7 in https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
+// +-------------------+------------+-------------+------------+-------------+
+// | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+0000..U+007F | 00..7F | | | |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+0080..U+07FF | C2..DF | 80..BF | | |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
+// +-------------------+------------+-------------+------------+-------------+
+// | U+100000..U+10FFFF| F4 | 80..8F | 80..BF | 80..BF |
+// +-------------------+------------+-------------+------------+-------------+
+
+// 3-byte leads: bit (trail >> 5) of entry [lead & 0xf] marks a valid 2nd byte.
+static constexpr std::array<uint8_t, 16> k3ByteTrailByteValidity{
+    0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,   // leads 0xe0..0xe7
+    0x30, 0x30, 0x30, 0x30, 0x30, 0x10, 0x30, 0x30};  // leads 0xe8..0xef
+
+// 4-byte leads: bit (lead & 7) of entry [trail >> 4] marks a valid 2nd byte.
+static constexpr std::array<uint8_t, 16> k4ByteTrailByteValidity{
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,   // trail 0x00..0x7f
+    0x1e, 0x0f, 0x0f, 0x0f, 0x00, 0x00, 0x00, 0x00};  // trail 0x80..0xff
+
+// Is utf-8 lead byte (0xc2..0xf4); one unsigned compare covers the range.
+#define IS_UTF8_LEAD_BYTE(c) (static_cast<uint8_t>((c) - 0xc2) <= 0x32)
+
+// Is utf-8 trail byte (0x80..0xbf): as a signed byte these are -128..-65.
+#define IS_UTF8_TRAIL_BYTE(c) (static_cast<int8_t>(c) < -0x40)
+
+// Is |trail_byte| a valid second byte after 2- or 3-byte lead |lead_byte|
+// (0xc2..0xef)? The expansion is non-zero when it is.
+#define IS_UTF8_TRAIL_2ND_BYTE_VALID(lead_byte, trail_byte) \
+  ((lead_byte) < 0xe0 ? IS_UTF8_TRAIL_BYTE(trail_byte) : \
+   htmlparser::k3ByteTrailByteValidity[(lead_byte) & 0xf] & \
+       (1 << (static_cast<uint8_t>(trail_byte) >> 5)))
+
+// Non-zero when |trail_byte| may follow 4-byte lead |lead_byte| (0xf0..0xf4).
+#define IS_UTF8_TRAIL_3RD_BYTE_VALID(lead_byte, trail_byte) \
+  ((lead_byte) >= 0xf0 ? htmlparser::k4ByteTrailByteValidity[ \
+      static_cast<uint8_t>(trail_byte) >> 4] & (1 << ((lead_byte) & 7)) : 0)
+
+// Decoders for 2-, 3- and 4-byte sequences. Each trail byte goes through
+// READ_TRAIL_BYTE, so only its six payload bits reach the code point.
+#define _DECODE_UTF8_2(c1, c2) \
+  ((((c1) & 0b11111) << 6) | READ_TRAIL_BYTE(c2))
+#define _DECODE_UTF8_3(c1, c2, c3) \
+  ((((c1) & 0b1111) << 12) | (READ_TRAIL_BYTE(c2) << 6) | READ_TRAIL_BYTE(c3))
+#define _DECODE_UTF8_4(c1, c2, c3, c4) \
+  ((((c1) & 0b111) << 18) | (READ_TRAIL_BYTE(c2) << 12) | \
+   (READ_TRAIL_BYTE(c3) << 6) | READ_TRAIL_BYTE(c4))
+
+// Expands to its sixth argument; TO_CODEPOINT uses it to select a decoder.
+#define _DECODE_UTF8_X(x, A, B, C, D, FUNC, ...) FUNC
+
+// Decodes 2, 3 or 4 UTF-8 bytes into a code point; validity is not checked.
+#define TO_CODEPOINT(...) \
+  _DECODE_UTF8_X(, ##__VA_ARGS__, \
+                 _DECODE_UTF8_4(__VA_ARGS__), \
+                 _DECODE_UTF8_3(__VA_ARGS__), \
+                 _DECODE_UTF8_2(__VA_ARGS__))
+
+// Is this code point a UTF-16 surrogate (U+d800..U+dfff).
+#define IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)
+
+}  // namespace htmlparser
+
+
+#endif  // HTMLPARSER__UTF8_H_
diff --git a/validator/cpp/htmlparser/utf8_test.cc b/validator/cpp/htmlparser/utf8_test.cc
new file mode 100644
index 000000000000..3b11a73578ce
--- /dev/null
+++ b/validator/cpp/htmlparser/utf8_test.cc
@@ -0,0 +1,203 @@
+#include "utf8.h"
+
+#include "gtest/gtest.h"
+
+TEST(UTF8Test, AsciiCharsTest) {
+  for (uint8_t i = 'a'; i <= 'z'; ++i) {  // Lower-case letters.
+    EXPECT_TRUE(IS_ASCII(i));
+    EXPECT_FALSE(IS_DIGIT(i));
+    EXPECT_TRUE(IS_ALPHABET(i));
+  }
+  for (uint8_t i = 'A'; i <= 'Z'; ++i) {  // Upper-case letters.
+    EXPECT_TRUE(IS_ASCII(i));
+    EXPECT_FALSE(IS_DIGIT(i));
+    EXPECT_TRUE(IS_ALPHABET(i));
+  }
+  for (uint8_t i = '0'; i <= '9'; ++i) {  // Decimal digits.
+    EXPECT_TRUE(IS_ASCII(i));
+    EXPECT_TRUE(IS_DIGIT(i));
+    EXPECT_FALSE(IS_ALPHABET(i));
+  }
+}
+
+TEST(UTF8Test, DecodeUtf8SymbolTest) {
+  EXPECT_EQ(TO_CODEPOINT(0xf0, 0x9d, 0x8c, 0x86), 0x1d306);  // U+1D306
+  EXPECT_EQ(TO_CODEPOINT(0xe2, 0x8c, 0x98), 0x2318);         // U+2318
+  EXPECT_EQ(TO_CODEPOINT(0xc5, 0x9a), 0x15a);                // U+015A
+}
+
+TEST(UTF8Test, ReadContinuationByteTest) {
+  // 0xc1 starts with bits 11, not 10, so it is not a continuation byte.
+  EXPECT_EQ(0, READ_TRAIL_BYTE(0xc1));
+  // A continuation byte yields its low six payload bits.
+  EXPECT_EQ(0x3f, READ_TRAIL_BYTE(0xbf));
+}
+
+TEST(UTF8Test, IsTrailByteTest) {
+  constexpr uint8_t kTrailBytes[] = {0x9d, 0x8c, 0x86, 0x98, 0x9a};  // 10xxxxxx
+  constexpr uint8_t kLeadBytes[] = {0xf0, 0xe2, 0xc5};               // 11xxxxxx
+  for (const uint8_t byte : kTrailBytes) {
+    EXPECT_TRUE(IS_UTF8_TRAIL_BYTE(byte));
+  }
+  for (const uint8_t byte : kLeadBytes) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_BYTE(byte));
+  }
+}
+
+TEST(UTF8Test, IsLeadingByteTest) {
+  // 0x00..0xc1: everything below the first lead byte, 0xc2.
+  for (uint8_t byte = 0x00; byte <= 0xc1; ++byte) {
+    EXPECT_FALSE(IS_UTF8_LEAD_BYTE(byte));
+  }
+
+  // 0xc2..0xf4: the lead-byte range itself.
+  for (uint8_t byte = 0xc2; byte <= 0xf4; ++byte) {
+    EXPECT_TRUE(IS_UTF8_LEAD_BYTE(byte));
+  }
+
+  // 0xf5..0xff: above the highest lead byte, 0xf4. 0xff is checked outside
+  // the loop because a uint8_t counter cannot count past it.
+  for (uint8_t byte = 0xf5; byte < 0xff; ++byte) {
+    EXPECT_FALSE(IS_UTF8_LEAD_BYTE(byte));
+  }
+  EXPECT_FALSE(IS_UTF8_LEAD_BYTE(0xff));
+}
+
+TEST(UTF8Test, CodePointByteSequenceCountTest) {
+  // Spot checks: an ASCII byte, then one lead byte per sequence length.
+  EXPECT_EQ(0, LEAD_BYTE_TRAIL_COUNT('a'));
+  EXPECT_EQ(1, LEAD_BYTE_TRAIL_COUNT(0xc5));
+  EXPECT_EQ(2, LEAD_BYTE_TRAIL_COUNT(0xe2));
+  EXPECT_EQ(3, LEAD_BYTE_TRAIL_COUNT(0xf0));
+
+  // Sweep each lead-byte range: {first lead, last lead, trail bytes expected}.
+  struct LeadRange {
+    uint8_t first;
+    uint8_t last;
+    int trail_count;
+  };
+  constexpr LeadRange kRanges[] = {
+      {0xc2, 0xdf, 1}, {0xe0, 0xef, 2}, {0xf0, 0xf4, 3}};
+  for (const LeadRange& range : kRanges) {
+    for (uint8_t lead = range.first; lead <= range.last; ++lead) {
+      EXPECT_EQ(range.trail_count, LEAD_BYTE_TRAIL_COUNT(lead));
+    }
+  }
+}
+
+TEST(UTF8Test, CodePointNumBytesTest) {
+  EXPECT_EQ(1, CODE_POINT_NUM_BYTES(0x61));     // U+0061 'a'
+  EXPECT_EQ(2, CODE_POINT_NUM_BYTES(0x15a));    // U+015A "Ś"
+  EXPECT_EQ(3, CODE_POINT_NUM_BYTES(0x2398));   // U+2398
+  EXPECT_EQ(4, CODE_POINT_NUM_BYTES(0x1d306));  // U+1D306 "𝌆"
+}
+
+TEST(UTF8Test, 3ByteSequenceValidityTest) {
+  // 2-byte leads 0xc2..0xdf accept exactly the trail bytes 0x80..0xbf.
+  for (uint8_t lead = 0xc2; lead <= 0xdf; ++lead) {
+    // Below and above the trail-byte range.
+    for (uint8_t trail = 0x00; trail <= 0x7f; ++trail) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, trail));
+    }
+    for (uint8_t trail = 0xc0; trail <= 0xfe; ++trail) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, trail));
+    }
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, 0xff));
+
+    // Inside the trail-byte range.
+    for (uint8_t trail = 0x80; trail <= 0xbf; ++trail) {
+      EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, trail));
+    }
+  }
+
+  // Lead 0xe0 accepts only 0xa0..0xbf.
+  // 0x00..0x9f covers non-trail bytes and the trail bytes 0x80..0x9f.
+  uint8_t fixed_lead = 0xe0;
+  for (uint8_t trail = 0x00; trail <= 0x9f; ++trail) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(fixed_lead, trail));
+  }
+  for (uint8_t trail = 0xa0; trail <= 0xbf; ++trail) {
+    EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(fixed_lead, trail));
+  }
+  // Anything above 0xbf is not a trail byte.
+  for (uint8_t trail = 0xc0; trail <= 0xfe; ++trail) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(fixed_lead, trail));
+  }
+  EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(fixed_lead, 0xff));
+
+  // Leads 0xe1..0xec accept the full trail-byte range 0x80..0xbf.
+  for (uint8_t lead = 0xe1; lead <= 0xec; ++lead) {
+    // Below and above the trail-byte range.
+    for (uint8_t trail = 0x00; trail <= 0x7f; ++trail) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, trail));
+    }
+    for (uint8_t trail = 0xc0; trail <= 0xfe; ++trail) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, trail));
+    }
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, 0xff));
+
+    // Inside the trail-byte range.
+    for (uint8_t trail = 0x80; trail <= 0xbf; ++trail) {
+      EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, trail));
+    }
+  }
+
+  // Lead 0xed accepts only 0x80..0x9f.
+  fixed_lead = 0xed;
+  // Below and above the trail-byte range.
+  for (uint8_t trail = 0x00; trail <= 0x7f; ++trail) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(fixed_lead, trail));
+  }
+  for (uint8_t trail = 0xc0; trail <= 0xfe; ++trail) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(fixed_lead, trail));
+  }
+  EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(fixed_lead, 0xff));
+
+  // Accepted half of the trail-byte range.
+  for (uint8_t trail = 0x80; trail <= 0x9f; ++trail) {
+    EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(fixed_lead, trail));
+  }
+  // Rejected half: 0xed 0xa0.. would decode into U+D800..U+DFFF.
+  for (uint8_t trail = 0xa0; trail <= 0xbf; ++trail) {
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(fixed_lead, trail));
+  }
+
+  // Leads 0xee..0xef accept the full trail-byte range 0x80..0xbf.
+  for (uint8_t lead = 0xee; lead <= 0xef; ++lead) {
+    // Below and above the trail-byte range.
+    for (uint8_t trail = 0x00; trail <= 0x7f; ++trail) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, trail));
+    }
+    for (uint8_t trail = 0xc0; trail <= 0xfe; ++trail) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, trail));
+    }
+    EXPECT_FALSE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, 0xff));
+
+    // Inside the trail-byte range.
+    for (uint8_t trail = 0x80; trail <= 0xbf; ++trail) {
+      EXPECT_TRUE(IS_UTF8_TRAIL_2ND_BYTE_VALID(lead, trail));
+    }
+  }
+}
+
+TEST(UTF8Test, 4ByteSequenceValidityTest) {
+  // Leads below 0xf0 never start a 4-byte sequence. int counters let the
+  // loops reach 0xff, which a uint8_t counter could not step past.
+  for (int lead = 0; lead < 0xf0; ++lead) {
+    for (int trail = 0; trail <= 0xff; ++trail) {
+      EXPECT_FALSE(IS_UTF8_TRAIL_3RD_BYTE_VALID(lead, trail));
+    }
+  }
+
+  // Leads 0xf0..0xf4 accept 0x80..0xbf, narrowed to 0x90.. after 0xf0 and
+  // to ..0x8f after 0xf4 (Unicode Table 3-7); everything else is rejected.
+  for (int lead = 0xf0; lead <= 0xf4; ++lead) {
+    const int low = lead == 0xf0 ? 0x90 : 0x80;
+    const int high = lead == 0xf4 ? 0x8f : 0xbf;
+    for (int trail = 0; trail <= 0xff; ++trail) {
+      const bool valid = (IS_UTF8_TRAIL_3RD_BYTE_VALID(lead, trail));
+      EXPECT_EQ(low <= trail && trail <= high, valid);
+    }
+  }
+}
+