From 6e751a441e0a207e01df6c6c9d18d452d5acb115 Mon Sep 17 00:00:00 2001
From: Allan Banaag <banaag@google.com>
Date: Mon, 5 Feb 2024 14:17:49 -0800
Subject: [PATCH] Sync for validator cpp engine and cpp htmlparser (#39797)

* Migrate from `cfg = "host"` to `cfg = "exec"`

PiperOrigin-RevId: 488423550

* Fix risky undefined behavior when host is empty.

PiperOrigin-RevId: 488505778

* Allow ampjs domain for serving amp js files.

PiperOrigin-RevId: 489206825

* No description.

PiperOrigin-RevId: 491684683

* Explicitly cast a char32_t to uint32_t before streaming it to output.

In C++17, the char32_t is silently formatted as a number, but in C++20, overloads of << accepting wide character types are deleted, breaking compilation of this code.

The value also was formatted in decimal afaict, but all other values here are formatted in hex, so do that here as well.

PiperOrigin-RevId: 499523425

* Fix AMP for Email spec URLs

Some of these URLs are pointing to the AMP for Website documentation.

PiperOrigin-RevId: 508406092

* Replace greggrothaus (now a xoogler) with Erwin in OWNERs file.

Edited README to:
- Move htmlparser out of beta. It is pretty stable and running in several production services since 2019.
- Remove references of being maintained by AMP working group, as it will be maintained by ex-engineers of working group.

PiperOrigin-RevId: 518921324

* Internal Code Change

PiperOrigin-RevId: 540320460

* Internal Code Change

PiperOrigin-RevId: 540323049

* Update README.md

---------

Co-authored-by: Googler <noreply@google.com>
Co-authored-by: Amaltas Bohra <amaltas@google.com>
---
 validator/cpp/engine/embed_data.bzl           |  2 +-
 validator/cpp/engine/parse-layout-sizes.h     |  2 +-
 validator/cpp/engine/parse-srcset_test.cc     |  2 +-
 validator/cpp/engine/testing-utils.cc         |  2 --
 validator/cpp/engine/utf8-util_test.cc        |  2 +-
 validator/cpp/engine/validator-internal.cc    | 36 +++++++++----------
 validator/cpp/engine/validator_test.cc        | 32 +++++++++--------
 validator/cpp/engine/wasm/BUILD               |  6 ----
 validator/cpp/engine/wasm/validator.js        |  2 +-
 validator/cpp/htmlparser/BUILD                |  4 +--
 validator/cpp/htmlparser/README.md            | 17 +--------
 validator/cpp/htmlparser/allocator.h          |  8 ++---
 .../cpp/htmlparser/bin/entitytablegen.cc      |  2 +-
 .../cpp/htmlparser/css/parse-css-urls.cc      |  2 +-
 validator/cpp/htmlparser/css/parse-css.cc     |  6 ++--
 validator/cpp/htmlparser/css/parse-css.h      |  2 +-
 validator/cpp/htmlparser/data/CaseFolding.txt | 10 +++---
 validator/cpp/htmlparser/data/amptags.txt     |  1 -
 validator/cpp/htmlparser/data/jsongrammar.txt |  2 +-
 validator/cpp/htmlparser/fileutil.cc          |  2 +-
 .../cpp/htmlparser/grammar/tablebuilder.h     |  4 +--
 validator/cpp/htmlparser/json/types.h         |  4 +--
 validator/cpp/htmlparser/node.h               |  2 +-
 validator/cpp/htmlparser/parser.cc            |  4 +--
 validator/cpp/htmlparser/parser_test.cc       |  2 +-
 validator/cpp/htmlparser/strings.cc           |  6 ++--
 validator/cpp/htmlparser/strings.h            |  4 +--
 validator/cpp/htmlparser/strings_test.cc      |  2 +-
 validator/cpp/htmlparser/url.cc               |  3 +-
 validator/cpp/htmlparser/url.h                |  2 +-
 validator/cpp/htmlparser/url_test.cc          |  6 ++--
 .../htmlparser/validators/ipaddress_test.cc   |  2 +-
 .../cpp/htmlparser/validators/json_test.cc    |  2 +-
 .../validators/supported_media_query_test.cc  |  2 +-
 34 files changed, 84 insertions(+), 103 deletions(-)

diff --git a/validator/cpp/engine/embed_data.bzl b/validator/cpp/engine/embed_data.bzl
index ff9ee299a9d9..ba0f40fd9867 100644
--- a/validator/cpp/engine/embed_data.bzl
+++ b/validator/cpp/engine/embed_data.bzl
@@ -39,7 +39,7 @@ embed_data = rule(
         ),
         "header_generator": attr.label(
             executable = True,
-            cfg = "host",
+            cfg = "exec",
             allow_files = True,
             default = Label(
                 "//cpp/engine/scripts:filecontents_to_cpp_header",
diff --git a/validator/cpp/engine/parse-layout-sizes.h b/validator/cpp/engine/parse-layout-sizes.h
index 130e20ca8654..2150cf858e03 100644
--- a/validator/cpp/engine/parse-layout-sizes.h
+++ b/validator/cpp/engine/parse-layout-sizes.h
@@ -9,7 +9,7 @@ namespace amp::validator::parse_layout_sizes {
 
 // WARNING: This code is still in development and not ready to be used.
 
-// This is a single representation for the CssSizes object.
+// This is a single representation for the CssSizes object.
 // It consists of at least a valid size and a possible media condition.
 // See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#Attributes
 struct CssSize {
diff --git a/validator/cpp/engine/parse-srcset_test.cc b/validator/cpp/engine/parse-srcset_test.cc
index 97bf3435e9ab..894ae8bf7db8 100644
--- a/validator/cpp/engine/parse-srcset_test.cc
+++ b/validator/cpp/engine/parse-srcset_test.cc
@@ -150,7 +150,7 @@ TEST(ParseSrcsetTest, LeadingAndTrailingCommasAndCommasInUrl) {
               EqCandidates({{"example.com/,/,/,/,50w", "1x"}}));
 }
 
-TEST(ParseSrcsetTest, NoWhitespace) {
+TEST(ParseSrcsetTest, NoWhitepsace) {
   SrcsetParsingResult result = ParseSourceSet("image 100w,image 50w");
   EXPECT_TRUE(result.success);
   EXPECT_THAT(result.srcset_images,
diff --git a/validator/cpp/engine/testing-utils.cc b/validator/cpp/engine/testing-utils.cc
index b6e115d30963..f6cc635024dc 100644
--- a/validator/cpp/engine/testing-utils.cc
+++ b/validator/cpp/engine/testing-utils.cc
@@ -97,8 +97,6 @@ const std::map<std::string, TestCase>& TestCases() {
            "external/amphtml-extensions/*/*/test/*.html",
             &html_files)) << "Test cases file pattern not found.";
 
-    CHECK(!html_files.empty()) << "Validator test cases are empty. Will not proceed.";
-    
     std::sort(html_files.begin(), html_files.end());
     for (const std::string& html_file : html_files) {
       if (html_file.find("/js_only/") != std::string::npos) continue;
diff --git a/validator/cpp/engine/utf8-util_test.cc b/validator/cpp/engine/utf8-util_test.cc
index a2a2472a646c..0f5f6b9bf783 100644
--- a/validator/cpp/engine/utf8-util_test.cc
+++ b/validator/cpp/engine/utf8-util_test.cc
@@ -14,7 +14,7 @@ TEST(Utf8UtilTest, Utf16StrLen) {
   // It's 34 bytes long and 22 utf-8 characters long. Javascript uses UTF16
   // strings and string lengths.
   // The chars in Iñtërnâtiônàlizætiøn vary between 1 and 2 byte lengths, all
-  // javascript 1-char lengths. The ⚡ is a 3-byte length character, with a
+  // javascript 1-char lengths. The ⚡ is a 3-byte length character, with a
   // 1-char javascript length. Finally the 💩 is a 4-byte length character with
   // a 2-char javascript length.
   EXPECT_EQ(Utf16StrLen("Iñtërnâtiônàlizætiøn☃💩"), 23);
diff --git a/validator/cpp/engine/validator-internal.cc b/validator/cpp/engine/validator-internal.cc
index fd62ee99e249..63856a5eb540 100644
--- a/validator/cpp/engine/validator-internal.cc
+++ b/validator/cpp/engine/validator-internal.cc
@@ -644,9 +644,9 @@ struct ParsedReferencePoint {
 class ParsedReferencePoints {
  public:
   ParsedReferencePoints() : parent_(nullptr) {}
-  ParsedReferencePoints(
-      const TagSpec& parent,
-      const unordered_map<std::string, int32_t>& tag_spec_ids_by_tag_spec_name)
+  ParsedReferencePoints(const TagSpec& parent,
+                        const absl::flat_hash_map<std::string, int32_t>&
+                            tag_spec_ids_by_tag_spec_name)
       : parent_(&parent) {
     for (const ReferencePoint& p : parent.reference_points()) {
       auto iter = tag_spec_ids_by_tag_spec_name.find(p.tag_spec_name());
@@ -1041,11 +1041,11 @@ RecordValidated ShouldRecordTagspecValidated(
 // which is unique within its context, the ParsedValidatorRules.
 class ParsedTagSpec {
  public:
-  ParsedTagSpec(
-      ParsedAttrSpecs* parsed_attr_specs,
-      const unordered_map<std::string, int32_t>& tag_spec_ids_by_tag_spec_name,
-      RecordValidated should_record_tagspec_validated, const TagSpec* spec,
-      int32_t id)
+  ParsedTagSpec(ParsedAttrSpecs* parsed_attr_specs,
+                const absl::flat_hash_map<std::string, int32_t>&
+                    tag_spec_ids_by_tag_spec_name,
+                RecordValidated should_record_tagspec_validated,
+                const TagSpec* spec, int32_t id)
       : spec_(spec),
         id_(id),
         reference_points_(*spec, tag_spec_ids_by_tag_spec_name),
@@ -1192,7 +1192,7 @@ class ParsedTagSpec {
 
   // Whether or not the tag should be recorded via
   // Context->RecordTagspecValidated if it was validated
-  // successfully. For performance, this is only done for tags that
+  // successfully. For performance, this is only done for tags that
   // are mandatory, unique, or possibly required by some other tag.
   RecordValidated ShouldRecordTagspecValidated() const {
     return should_record_tagspec_validated_;
@@ -1218,7 +1218,7 @@ class ParsedTagSpec {
 
   const set<int32_t>& implicit_attrspecs() const { return implicit_attrspecs_; }
 
-  const unordered_map<std::string, int32_t>& attr_ids_by_name() const {
+  const absl::flat_hash_map<std::string, int32_t>& attr_ids_by_name() const {
     return attr_ids_by_name_;
   }
 
@@ -1240,7 +1240,7 @@ class ParsedTagSpec {
   bool is_reference_point_;
   bool is_type_json_ = false;
   bool contains_url_ = false;
-  unordered_map<std::string, int32_t> attr_ids_by_name_;
+  absl::flat_hash_map<std::string, int32_t> attr_ids_by_name_;
   vector<TypeIdentifier> disabled_by_;
   vector<TypeIdentifier> enabled_by_;
   vector<int32_t> mandatory_attr_ids_;
@@ -1284,7 +1284,7 @@ std::string TagSpecUrl(const TagSpec& spec) {
     return StrCat(extension_spec_url_prefix, spec.extension_spec().name());
   if (spec.requires_extension_size() > 0)
     // Return the first |requires_extension|, which should be the most
-    // representative.
+    // representative.
     return StrCat(extension_spec_url_prefix, spec.requires_extension(0));
 
   return "";
@@ -2476,7 +2476,7 @@ class Context {
     if (!tag_result.best_match_tag_spec) return;
     const ParsedTagSpec* parsed_tag_spec = tag_result.best_match_tag_spec;
     if (!parsed_tag_spec->AttrsCanSatisfyExtension()) return;
-    const unordered_map<std::string, int32_t>& attr_ids_by_name =
+    const absl::flat_hash_map<std::string, int32_t>& attr_ids_by_name =
         parsed_tag_spec->attr_ids_by_name();
     ExtensionsContext* extensions_ctx = mutable_extensions();
     for (const ParsedHtmlTagAttr& attr : encountered_tag.Attributes()) {
@@ -2834,11 +2834,11 @@ class InvalidRuleVisitor : public htmlparser::css::RuleVisitor {
 class InvalidDeclVisitor : public htmlparser::css::RuleVisitor {
  public:
   InvalidDeclVisitor(const ParsedDocCssSpec& css_spec, Context* context,
-                     const std::string& tag_descriptive_name,
+                     const std::string& tag_decriptive_name,
                      ValidationResult* result)
       : css_spec_(css_spec),
         context_(context),
-        tag_descriptive_name_(tag_descriptive_name),
+        tag_descriptive_name_(tag_decriptive_name),
         result_(result) {}
 
   void VisitDeclaration(
@@ -4412,7 +4412,7 @@ void ValidateAttributes(const ParsedTagSpec& parsed_tag_spec,
   set<std::string_view> mandatory_anyofs_seen;
   vector<const ParsedAttrTriggerSpec*> parsed_trigger_specs;
   set<int32_t> attrspecs_validated;
-  const unordered_map<std::string, int32_t>& attr_ids_by_name =
+  const absl::flat_hash_map<std::string, int32_t>& attr_ids_by_name =
       parsed_tag_spec.attr_ids_by_name();
 
   for (const ParsedHtmlTagAttr& attr : encountered_tag.Attributes()) {
@@ -4717,7 +4717,7 @@ ParsedValidatorRules::ParsedValidatorRules(HtmlFormat::Code html_format)
   // |tag_spec_names_to_track| to identify those tagspecs that are
   // referenced by others via "also_requires_tag".  The ParsedTagSpec
   // constructor completes this translation to ids.
-  unordered_map<std::string, int32_t> tag_spec_ids_by_tag_spec_name;
+  absl::flat_hash_map<std::string, int32_t> tag_spec_ids_by_tag_spec_name;
   unordered_set<std::string> tag_spec_names_to_track;
   for (int ii = 0; ii < rules_.tags_size(); ++ii) {
     const TagSpec& tag = rules_.tags(ii);
@@ -5622,7 +5622,7 @@ void ReferencePointMatcher::RecordMatch(const ParsedTagSpec& reference_point) {
 
 void ReferencePointMatcher::ExitParentTag(const Context& context,
                                           ValidationResult* result) const {
-  absl::node_hash_map<int32_t, int32_t> reference_point_by_count;
+  absl::flat_hash_map<int32_t, int32_t> reference_point_by_count;
   for (int32_t r : reference_points_matched_) ++reference_point_by_count[r];
   for (const ParsedReferencePoint& p : *parsed_reference_points_) {
     if (p.point->mandatory() && reference_point_by_count.find(p.tag_spec_id) ==
diff --git a/validator/cpp/engine/validator_test.cc b/validator/cpp/engine/validator_test.cc
index a4421cf7f84d..545f14abbe59 100644
--- a/validator/cpp/engine/validator_test.cc
+++ b/validator/cpp/engine/validator_test.cc
@@ -481,8 +481,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmail) {
         ":13:2 The author stylesheet specified in tag 'style amp-custom' "
         "is too long - document contains 75001 bytes whereas the "
         "limit is 75000 "
-        "bytes. (see https://amp.dev/documentation/guides-and-tutorials/email/learn/"
-        "spec/amphtml#maximum-size)");
+        "bytes. (see https://amp.dev/documentation/guides-and-tutorials/email/"
+        "learn/spec/amphtml#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -527,7 +527,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmail) {
         ":19:6 The author stylesheet specified in tag 'style amp-custom' "
         "and the combined inline styles is too large - document contains 75010 "
         "bytes whereas the limit is 75000 bytes. (see https://amp.dev/"
-        "documentation/guides-and-tutorials/email/learn/spec/amphtml#maximum-size)");
+        "documentation/guides-and-tutorials/email/learn/spec/amphtml"
+        "#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -555,7 +556,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmail) {
         ":7519:6 The author stylesheet specified in tag 'style amp-custom' "
         "and the combined inline styles is too large - document contains 75014 "
         "bytes whereas the limit is 75000 bytes. (see https://amp.dev/"
-        "documentation/guides-and-tutorials/email/learn/spec/amphtml#maximum-size)");
+        "documentation/guides-and-tutorials/email/learn/spec/amphtml"
+        "#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -649,8 +651,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmailStrict) {
         ":13:2 The author stylesheet specified in tag 'style amp-custom' "
         "is too long - document contains 75001 bytes whereas the "
         "limit is 75000 "
-        "bytes. (see https://amp.dev/documentation/guides-and-tutorials/email/learn/"
-        "spec/amphtml#maximum-size)");
+        "bytes. (see https://amp.dev/documentation/guides-and-tutorials/email/"
+        "learn/spec/amphtml#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -681,7 +683,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmailStrict) {
         ":19:6 The author stylesheet specified in tag 'style amp-custom' "
         "and the combined inline styles is too large - document contains 75010 "
         "bytes whereas the limit is 75000 bytes. (see https://amp.dev/"
-        "documentation/guides-and-tutorials/email/learn/spec/amphtml#maximum-size)");
+        "documentation/guides-and-tutorials/email/learn/spec/amphtml"
+        "#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -701,7 +704,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmailStrict) {
         ":3769:6 The author stylesheet specified in tag 'style amp-custom' "
         "and the combined inline styles is too large - document contains 75014 "
         "bytes whereas the limit is 75000 bytes. (see https://amp.dev/"
-        "documentation/guides-and-tutorials/email/learn/spec/amphtml#maximum-size)");
+        "documentation/guides-and-tutorials/email/learn/spec/amphtml"
+        "#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -730,12 +734,12 @@ TEST(ValidatorTest, TestCssLengthAmpEmailStrict) {
     std::string output = RenderResult(
         /*filename=*/test_case_name,
         amp::validator::Validate(test_html, HtmlFormat::AMP4EMAIL));
-    std::string expected_output =
-        StrCat("FAIL\n", test_case_name,
-               ":17:2 The inline style specified in tag 'div' is too long - it "
-               "contains 1001 bytes whereas the limit is 1000 bytes. (see "
-               "https://amp.dev/documentation/guides-and-tutorials/email/learn/spec/"
-               "amphtml#maximum-size)");
+    std::string expected_output = StrCat(
+        "FAIL\n", test_case_name,
+        ":17:2 The inline style specified in tag 'div' is too long - it "
+        "contains 1001 bytes whereas the limit is 1000 bytes. (see "
+        "https://amp.dev/documentation/guides-and-tutorials/email/learn/spec/"
+        "amphtml#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 }
diff --git a/validator/cpp/engine/wasm/BUILD b/validator/cpp/engine/wasm/BUILD
index e6ddc86cc49f..4253cb6e5a2d 100644
--- a/validator/cpp/engine/wasm/BUILD
+++ b/validator/cpp/engine/wasm/BUILD
@@ -1,7 +1,6 @@
 # Wraps AMP Validator into a WebAssembly library,
 # which can be used by javascript files.
 
-load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load("@emsdk//emscripten_toolchain:wasm_rules.bzl", "wasm_cc_binary")
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_js_binary", "closure_js_library")
 
@@ -73,8 +72,3 @@ closure_js_binary(
         ":validator_js_lib",
     ],
 )
-
-build_test(
-    name = "validator_js_test",
-    targets = [":validator_js_bin"],
-)
diff --git a/validator/cpp/engine/wasm/validator.js b/validator/cpp/engine/wasm/validator.js
index e4008ede60ad..5014482505fa 100644
--- a/validator/cpp/engine/wasm/validator.js
+++ b/validator/cpp/engine/wasm/validator.js
@@ -83,7 +83,7 @@ function digitizeValidationErrorFields(error) {
 /**
  * When transforming validation errors and validation results from jspb to plain
  * objects, the protobuf base64 string is also attached to the output.
- * Hence when a plain object needs to be transformed back to protobuf,
+ * Hence when a plain object needs to be transformed back to protobuf,
  * the attached base64 could be directly used.
  */
 const PB_BASE64 = '_PB_BASE64';
diff --git a/validator/cpp/htmlparser/BUILD b/validator/cpp/htmlparser/BUILD
index 3d31520f96a2..78de2920159d 100644
--- a/validator/cpp/htmlparser/BUILD
+++ b/validator/cpp/htmlparser/BUILD
@@ -71,7 +71,7 @@ cc_test(
 )
 
 # Similar to go lang's defer statement. Defers the execution of statement
-# until in which it is declared goes out of scope.
+# until in which it is declared goes out of scope.
 cc_library(
     name = "defer",
     hdrs = [
@@ -80,7 +80,7 @@ cc_library(
     copts = ["-std=c++17"],
 )
 
-# Helper library declares various doctype constants and a utility function to
+# Helper library declares various doctype constants and a utility function to
 # parse doctype string and extract various components in it.
 cc_library(
     name = "doctype",
diff --git a/validator/cpp/htmlparser/README.md b/validator/cpp/htmlparser/README.md
index 53bd5637fea5..b181218b06df 100644
--- a/validator/cpp/htmlparser/README.md
+++ b/validator/cpp/htmlparser/README.md
@@ -1,21 +1,6 @@
 # HTML Parser
 
-This is an HTML5 compliant parser written in C++. It was created to be used by
-the
-[AMPHTML Validator](https://github.com/ampproject/amphtml/tree/main/validator)
-to standardize how AMPHTML documents should be parsed for AMP validation.
-
-## Maintainers
-
-This parser is maintained by the [AMP Working Group](https://amp.dev/community/working-groups/amp4email/):
-[Caching](https://amp.dev/community/working-groups/caching/)
-
-## Current Status
-
-This parser is in active development and has several outstanding TODOs.
-These TODOs may cause certain parsing tests to fail. Those tests have been
-excluded until the TODOs are resolved. See htmldataset_test.cc for a list of
-those tests.
+This is an HTML5 compliant parser written in C++.
 
 ## Building and Testing with Bazel
 
diff --git a/validator/cpp/htmlparser/allocator.h b/validator/cpp/htmlparser/allocator.h
index 09e9d4b9dd50..0620c4ed32fb 100644
--- a/validator/cpp/htmlparser/allocator.h
+++ b/validator/cpp/htmlparser/allocator.h
@@ -62,7 +62,7 @@
 // is naturally aligned if the address used to identify it has an 8-byte
 // alignment.
 //
-// Following data structure contains members totaling 13 bytes, but it's actual
+// Following data structure contains members totaling 13 bytes, but its actual
 // size is 24 bytes due to 8 byte alignment.
 //
 // Alignment is always equal to the largest sized element in the structure.
@@ -201,10 +201,10 @@ class Allocator {
   Allocator& operator=(const Allocator&) = delete;
 
   // Allocates memory of same size required to construct object of type T.
-  // Returns nullptr if allocation failed.
+  // Returns nullptr if allocation failed.
   void* Allocate() {
     // Checks if remaining bytes in block are less than object size, or
-    // remaining bytes after alignment is less than object size.
+    // remaining bytes after alignment is less than object size.
     // Add a new block.
     if (object_size_ > remaining_ || !AlignFreeAddress()) {
       if (!NewBlock()) return nullptr;
@@ -338,7 +338,7 @@ class Allocator {
   }
 
   // If the block's address is not aligned, moves the pointer to the address
-  // that is multiple of alignment_.
+  // that is multiple of alignment_.
   bool AlignFreeAddress() {
     // Checks how many bytes to skip to be at the correct alignment.
     if (const std::size_t skip =
diff --git a/validator/cpp/htmlparser/bin/entitytablegen.cc b/validator/cpp/htmlparser/bin/entitytablegen.cc
index 20463ab0b9d0..95810636a429 100644
--- a/validator/cpp/htmlparser/bin/entitytablegen.cc
+++ b/validator/cpp/htmlparser/bin/entitytablegen.cc
@@ -139,7 +139,7 @@ int main(int argc, char** argv) {
 
       if ((code_point & 0xffffff80) == 0) {  // 1 byte sequence.
         // 0b0xxxxxx.
-        fd << "\\x" << code_point;
+        fd << "\\x" << std::hex << static_cast<uint32_t>(code_point);
       } else if ((code_point & 0xfffff800) == 0) {  // 2 byte sequence.
         // 0b110xxxxx 0b10xxxxxx.
         fd << "\\x" << std::hex << ((code_point >> 6) | 0xc0)
diff --git a/validator/cpp/htmlparser/css/parse-css-urls.cc b/validator/cpp/htmlparser/css/parse-css-urls.cc
index 75ce200899ae..9d2ae3d3033d 100644
--- a/validator/cpp/htmlparser/css/parse-css-urls.cc
+++ b/validator/cpp/htmlparser/css/parse-css-urls.cc
@@ -47,7 +47,7 @@ void Preprocess(vector<char32_t>* codepoints) {
         out.push_back('\n');
         last_codepoint_was_cr = true;
         break;
-      case '\f':  // also known as form feed (FF)
+      case '\f':  // also known as form feed (FF)
         out.push_back('\n');
         last_codepoint_was_cr = false;
         break;
diff --git a/validator/cpp/htmlparser/css/parse-css.cc b/validator/cpp/htmlparser/css/parse-css.cc
index 7e0b4aa9ab26..4df12b588e33 100644
--- a/validator/cpp/htmlparser/css/parse-css.cc
+++ b/validator/cpp/htmlparser/css/parse-css.cc
@@ -85,7 +85,7 @@ const std::string& Token::StringValue() const {
 }
 
 std::string Token::ToString() const {
-  // The following are overridden in their class: AT_KEYWORD, CLOSE_CURLY,
+  // The following are overridden in their class: AT_KEYWORD, CLOSE_CURLY,
   // CLOSE_PAREN, CLOSE_SQUARE, DELIM, DIMENSION, FUNCTION_TOKEN, IDENT,
   // NUMBER, OPEN_CURLY, OPEN_PAREN, OPEN_SQUARE, PERCENTAGE, STRING, URL
   switch (Type()) {
@@ -343,7 +343,7 @@ bool Whitespace(char32_t code) {
 char32_t kMaximumallowedcodepoint = 0x10ffff;
 
 // A MarkedPosition object saves position information from the tokenizer
-// provided as |line| and |col| to the constructor and can later write that
+// provided as |line| and |col| to the constructor and can later write that
 // position back to a Token object.
 class MarkedPosition {
  public:
@@ -2471,7 +2471,7 @@ CombinatorType::Code CombinatorTypeForToken(const Token& token) {
   if (IsDelim(token, "+")) return CombinatorType::ADJACENT_SIBLING;
   if (IsDelim(token, "~")) return CombinatorType::GENERAL_SIBLING;
   // CombinatorTypeForToken is only ever called if the token has one of these
-  // delimiters, so reaching this point is impossible.
+  // delimiters, so reaching this point is impossible.
   CHECK(false) << absl::StrCat(
       "not a combinator token - type=", TokenType::Code_Name(token.Type()),
       " value=", token.StringValue());
diff --git a/validator/cpp/htmlparser/css/parse-css.h b/validator/cpp/htmlparser/css/parse-css.h
index 2f802e214558..d1612a0c4258 100644
--- a/validator/cpp/htmlparser/css/parse-css.h
+++ b/validator/cpp/htmlparser/css/parse-css.h
@@ -741,7 +741,7 @@ class Selector : public Token {
   virtual void Accept(SelectorVisitor* visitor) const = 0;
 };
 
-// This node models type selectors and universal selectors.
+// This node models type selectors and universal selectors.
 // http://www.w3.org/TR/css3-selectors/#type-selectors
 // http://www.w3.org/TR/css3-selectors/#universal-selector
 class TypeSelector : public Selector {
diff --git a/validator/cpp/htmlparser/data/CaseFolding.txt b/validator/cpp/htmlparser/data/CaseFolding.txt
index 932ace29e6d4..65aa0fcd6b32 100644
--- a/validator/cpp/htmlparser/data/CaseFolding.txt
+++ b/validator/cpp/htmlparser/data/CaseFolding.txt
@@ -1,11 +1,11 @@
-# CaseFolding-14.0.0.txt
-# Date: 2021-03-08, 19:35:41 GMT
-# © 2021 Unicode®, Inc.
+# CaseFolding-15.0.0.txt
+# Date: 2022-02-02, 23:35:35 GMT
+# © 2022 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For terms of use, see https://www.unicode.org/terms_of_use.html
 #
 # Unicode Character Database
-#   For documentation, see http://www.unicode.org/reports/tr44/
+#   For documentation, see https://www.unicode.org/reports/tr44/
 #
 # Case Folding Properties
 #
diff --git a/validator/cpp/htmlparser/data/amptags.txt b/validator/cpp/htmlparser/data/amptags.txt
index b5ac9625fa0f..032b0b90ad9f 100644
--- a/validator/cpp/htmlparser/data/amptags.txt
+++ b/validator/cpp/htmlparser/data/amptags.txt
@@ -93,7 +93,6 @@ amp-script
 amp-selector
 amp-sidebar
 amp-skimlinks
-amp-slikeplayer
 amp-smartlinks
 amp-social-share
 amp-soundcloud
diff --git a/validator/cpp/htmlparser/data/jsongrammar.txt b/validator/cpp/htmlparser/data/jsongrammar.txt
index db0cd32d8291..d79b3eb8cd4e 100644
--- a/validator/cpp/htmlparser/data/jsongrammar.txt
+++ b/validator/cpp/htmlparser/data/jsongrammar.txt
@@ -128,7 +128,7 @@ DICT_KEY_BEGIN_QUOTE "\t\n\r " DICT_KEY_BEGIN_QUOTE;
 DICT "\t\n\r " DICT;
 DICT '}' POP DICT_END;
 DICT_KEY ':' DICT_VALUE DICT_KEY_END;
-# Ignore whitespace between key and colon.
+# Ignore whitespace between key and colon.
 DICT_KEY "\t\n\r " DICT_KEY;
 DICT_VALUE '"' STRING|PUSH|DICT_END_OR_SEPARATOR STRING_T;
 DICT_VALUE 't' TRUE_1|PUSH|DICT_END_OR_SEPARATOR TRUE_T;
diff --git a/validator/cpp/htmlparser/fileutil.cc b/validator/cpp/htmlparser/fileutil.cc
index a59ca2aa3cb2..58b5834bf9b1 100644
--- a/validator/cpp/htmlparser/fileutil.cc
+++ b/validator/cpp/htmlparser/fileutil.cc
@@ -18,7 +18,7 @@
 // effects is minimized because this is a cc file not an h file. In fact, the
 // WebAssembly module never calls the glob function, so it is better to split
 // fileutil.cc into two files, and the WebAssembly module will only depend on
-// the one without glob functions.
+// the one without glob functions.
 #ifndef GLOB_TILDE
 #define GLOB_TILDE (1 << 12)
 #endif
diff --git a/validator/cpp/htmlparser/grammar/tablebuilder.h b/validator/cpp/htmlparser/grammar/tablebuilder.h
index 4052ac78f263..f62e22f95007 100644
--- a/validator/cpp/htmlparser/grammar/tablebuilder.h
+++ b/validator/cpp/htmlparser/grammar/tablebuilder.h
@@ -7,7 +7,7 @@
 // class must be thoroughly tested.
 //
 // Builds a state table by reading grammar file that contains rules for parsing
-// a basic (limited), context free, unambiguous grammar.
+// a basic (limited), context free, unambiguous grammar.
 //
 // Using TableBuilder one can generate parser states by writing rules in a
 // text file. See htmlparser/data/jsongrammar.txt.
@@ -15,7 +15,7 @@
 // Grammar text file contains rules which lists states and its transition
 // from one state to another as parser reads input characters. The parse
 // table is pushdown automation that uses stack to push and pop parsing
-// states. Unlike LR parsers there is no shift at each stage of parsing.
+// states. Unlike LR parsers there is no shift at each stage of parsing.
 
 // See grammar.txt tutorial for learning grammar syntax.
 // TODO: Add grammar tutorial.
diff --git a/validator/cpp/htmlparser/json/types.h b/validator/cpp/htmlparser/json/types.h
index 5ace320b1131..aa54f9a110c4 100644
--- a/validator/cpp/htmlparser/json/types.h
+++ b/validator/cpp/htmlparser/json/types.h
@@ -1,7 +1,7 @@
 // Declares types in json spec (http://www.json.org):
-// JsonArray: List of heterogenous types. [1, true, "foo",...]
+// JsonArray: List of heterogenous types. [1, true, "foo",...]
 //
-// JsonDict = Key value pairs of heterogenous values, key is always std::string.
+// JsonDict = Key value pairs of heterogenous values, key is always std::string.
 // {"foo": "bar", "count": 1,...}
 //
 // JsonObject = Encapsulates any type:
diff --git a/validator/cpp/htmlparser/node.h b/validator/cpp/htmlparser/node.h
index a9c001bd939b..31b8c9482ce4 100644
--- a/validator/cpp/htmlparser/node.h
+++ b/validator/cpp/htmlparser/node.h
@@ -61,7 +61,7 @@ class Node {
   // This does not change order or parent/child relationship of this or child
   // nodes in the tree.
   // Generally, treat this as a private function. Part of public interface for
-  // some specific scenarios:
+  // some specific scenarios:
   // A) Unit testing.
   // B) When parsing a fragment.
   // C) Custom error/warning reporting.
diff --git a/validator/cpp/htmlparser/parser.cc b/validator/cpp/htmlparser/parser.cc
index 31b213a82e67..1ddbd303054d 100644
--- a/validator/cpp/htmlparser/parser.cc
+++ b/validator/cpp/htmlparser/parser.cc
@@ -396,7 +396,7 @@ void Parser::AddText(const std::string& text) {
 
   text_node->data_.assign(text, 0, text.size());
   AddChild(text_node);
-  // Count number of terms in the text node, except if this is <script>,
+  // Count number of terms in the text node, except if this is <script>,
   // <textarea> or a comment node.
   if (count_num_terms_in_text_node_ && text_node->Parent() &&
       text_node->Parent()->DataAtom() != Atom::SCRIPT &&
@@ -1140,7 +1140,7 @@ bool Parser::InBodyIM() {  // NOLINT
       ReconstructActiveFormattingElements();
       AddText(d);
       if (frameset_ok_ && !Strings::IsAllWhitespaceChars(d)) {
-        // There were non-whitespace characters inserted.
+        // There were non-whitespace characters inserted.
         frameset_ok_ = false;
       }
       break;
diff --git a/validator/cpp/htmlparser/parser_test.cc b/validator/cpp/htmlparser/parser_test.cc
index c001b1133443..4a5a74de57ed 100644
--- a/validator/cpp/htmlparser/parser_test.cc
+++ b/validator/cpp/htmlparser/parser_test.cc
@@ -309,7 +309,7 @@ TEST(ParserTest, LineBreakAtPeekableCharacter) {
 
 // Tests duplicate body tags are ignored but their attributes copied to original
 // body tag.
-TEST(ParserTest, SubsequentlyBodyTagAttributesCopied) {
+TEST(ParserTest, SubsequentyBodyTagAttributesCopied) {
   std::string html = ("<html>\n<body id=\"bdy\">\n<div>hello</div></body>"s
                       "<body class=\"bd-cls\"><div>world</div></body></html>");
   htmlparser::Parser parser(html);
diff --git a/validator/cpp/htmlparser/strings.cc b/validator/cpp/htmlparser/strings.cc
index 44ee387d3b72..999ab6622912 100644
--- a/validator/cpp/htmlparser/strings.cc
+++ b/validator/cpp/htmlparser/strings.cc
@@ -826,7 +826,7 @@ std::pair<int, int> UnescapeEntity(std::string* b, int dst, int src,
       // Replace characters from Windows-1252 with UTF-8 equivalents.
       x = kReplacementTable[x - 0x80];
     } else if (x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF) {
-      // Replace invalid characters with the replacement character.
+      // Replace invalid characters with the replacement character.
       x = L'\uFFFD';
     }
 
@@ -839,7 +839,7 @@ std::pair<int, int> UnescapeEntity(std::string* b, int dst, int src,
     }
   }
 
-  // Consume the maximum number of characters possible, with the consumed
+  // Consume the maximum number of characters possible, with the consumed
   // characters matching one of the named references.
   while (i < s.size()) {
     auto c = s.at(i);
@@ -953,7 +953,7 @@ bool ExtractChars(std::string_view str, std::vector<char32_t>* chars) {
   while (!str.empty()) {
     uint8_t c = str.front() & 0xff;
 
-    // ASCII characters first.
+    // ASCII characters first.
     if (IsOneByteASCIIChar(c)) {
       chars->push_back(c);
       str.remove_prefix(1);
diff --git a/validator/cpp/htmlparser/strings.h b/validator/cpp/htmlparser/strings.h
index 52b837a73d6b..67ef42e54d69 100644
--- a/validator/cpp/htmlparser/strings.h
+++ b/validator/cpp/htmlparser/strings.h
@@ -132,7 +132,7 @@ class Strings {
     std::vector<char32_t> out;
     out.reserve(utf8.size() / 2);
     // We use the UnicodeText abstraction because it handles
-    // validation / conversion under the hood, so what comes out of this is
+    // validation / conversion under the hood, so what comes out of this is
     // surely valid UTF8.
     auto codepoint = DecodeUtf8Symbol(&utf8);
     while (codepoint) {
@@ -194,7 +194,7 @@ class Strings {
   static void ToLower(std::string* s);
   static void ToUpper(std::string* s);
 
-  // Checks if string contains whitespace only characters.
+  // Checks if string contains whitespace only characters.
   static bool IsAllWhitespaceChars(std::string_view s,
       std::string_view whitespace_chars = kWhitespace);
 
diff --git a/validator/cpp/htmlparser/strings_test.cc b/validator/cpp/htmlparser/strings_test.cc
index ec34074dfc7b..2ae273c41c4b 100644
--- a/validator/cpp/htmlparser/strings_test.cc
+++ b/validator/cpp/htmlparser/strings_test.cc
@@ -493,7 +493,7 @@ TEST(StringsTest, TranslateTest) {
   // Translate failed.
   EXPECT_FALSE(t8.has_value());
 
-  // Translate ignore characters if abc is longer than xyz.
+  // Translate ignore characters if abc is longer than xyz.
   // In the following abc string ...wn is removed from translated string.
   auto t9 = htmlparser::Strings::Translate(
       "The quick brown fox.", "brown", "red");
diff --git a/validator/cpp/htmlparser/url.cc b/validator/cpp/htmlparser/url.cc
index 1352e45cc14f..98c2ad8d01d8 100644
--- a/validator/cpp/htmlparser/url.cc
+++ b/validator/cpp/htmlparser/url.cc
@@ -164,7 +164,8 @@ void URL::ParseAuthority() {
 
   bool is_ipv6_literal = false;
   std::string_view host = url_.substr(host_begin, host_end - host_begin);
-  if (host.front() == '[' && host.back() == ']' && host.size() > 2 /* [] */) {
+  if (!host.empty() && host.front() == '[' && host.back() == ']' &&
+      host.size() > 2 /* [] */) {
     is_ipv6_literal = true;
     host.remove_prefix(1);
     host.remove_suffix(1);
diff --git a/validator/cpp/htmlparser/url.h b/validator/cpp/htmlparser/url.h
index 26129eb87a15..f890b76cf4b7 100644
--- a/validator/cpp/htmlparser/url.h
+++ b/validator/cpp/htmlparser/url.h
@@ -1,4 +1,4 @@
-// Uniform resource locator parsing related static functions.
+// Uniform resource locator parsing related static functions.
 
 #ifndef CPP_HTMLPARSER_URL_H_
 #define CPP_HTMLPARSER_URL_H_
diff --git a/validator/cpp/htmlparser/url_test.cc b/validator/cpp/htmlparser/url_test.cc
index a05dbca6619d..ba6e7e0df335 100644
--- a/validator/cpp/htmlparser/url_test.cc
+++ b/validator/cpp/htmlparser/url_test.cc
@@ -92,11 +92,11 @@ TEST(URLTest, BasicTests) {
   EXPECT_EQ(port_url.protocol(), "http");
   EXPECT_EQ(port_url.hostname(), "www.google.com");
   EXPECT_EQ(port_url.port(), 8080);
-  EXPECT_EQ(url.path_params_fragment(), "/");
+  EXPECT_EQ(port_url.path_params_fragment(), "/");
 
   URL port_url2("http://www.google.com:0080/foo:8080");
   EXPECT_EQ(port_url2.port(), 80);
-  EXPECT_EQ(url.path_params_fragment(), "/foo:8080");
+  EXPECT_EQ(port_url2.path_params_fragment(), "/foo:8080");
 
   // Invalid port.
   URL invalid_port("http://www.google.com:99999/foo");
@@ -177,7 +177,7 @@ TEST(URLTest, IPv6Urls) {
   EXPECT_TRUE(URL("http://[1:2:3:4:5:6::abc]").is_valid());
   EXPECT_TRUE(URL("http://[1:2:3:4:5:6::abcd]").is_valid());
 
-  // Illegal characters.
+  // Illegal characters.
   EXPECT_FALSE(URL("http://[abcd:123_:abcd::]").is_valid());
   EXPECT_FALSE(URL("http://[abcd:123O:abcd::]").is_valid());
   // Two compression not valid.
diff --git a/validator/cpp/htmlparser/validators/ipaddress_test.cc b/validator/cpp/htmlparser/validators/ipaddress_test.cc
index 2dbaad61996f..fc3e9584ab02 100644
--- a/validator/cpp/htmlparser/validators/ipaddress_test.cc
+++ b/validator/cpp/htmlparser/validators/ipaddress_test.cc
@@ -112,7 +112,7 @@ TEST(ParserTest, InValidURLs) {
   EXPECT_EQ(V("[xyz::abcd:0a]").second.second, 2 /* x */);
   // More than 4 bytes.
   EXPECT_FALSE(V("[abcd:1234:abcde::]").first);
-  // Illegal characters.
+  // Illegal characters.
   EXPECT_FALSE(V("[abcd:1234:abcd#::]").first);
   EXPECT_FALSE(V("[abcd:123_:abcd::]").first);
   EXPECT_FALSE(V("[abcd:123O:abcd::]").first);
diff --git a/validator/cpp/htmlparser/validators/json_test.cc b/validator/cpp/htmlparser/validators/json_test.cc
index a610a4fdb6e2..5b9f37adc04d 100644
--- a/validator/cpp/htmlparser/validators/json_test.cc
+++ b/validator/cpp/htmlparser/validators/json_test.cc
@@ -102,7 +102,7 @@ TEST(ParserTest, ValidJson) {
   EXPECT_TRUE(V("[\"amaltas\", \"seol\", \"hello\", \"ok\\uA93Cbye\"]").first);
   EXPECT_TRUE(V("[true, false, false, true, true, null]").first);
   EXPECT_TRUE(V("[1.0, 2.0, 3.0, 4.0e+3, 5.12e-3, 6.40e3, 0.1, -0.1]").first);
-  // Basic with different type values.
+  // Basic with different type values.
   EXPECT_TRUE(V("[1, true, \"amaltas\", 1.0, null, \"world\", false]").first);
   // Array inside array.
   EXPECT_TRUE(V("[1, [2, [3, [4, 5]]], true, false, [\"a\", \"b\"]]").first);
diff --git a/validator/cpp/htmlparser/validators/supported_media_query_test.cc b/validator/cpp/htmlparser/validators/supported_media_query_test.cc
index 7e1023ce57be..26d0b4fb044f 100644
--- a/validator/cpp/htmlparser/validators/supported_media_query_test.cc
+++ b/validator/cpp/htmlparser/validators/supported_media_query_test.cc
@@ -44,7 +44,7 @@ TEST(SupportedMediaQueryParserTest, InValidQuerySingleExpression) {
   EXPECT_EQ(r.second.second, 13 /* min is invalid expecting min-width etc. */);
 
   r = V("all and (min-width: 200px and max-width: 200px)");
-  // ------------------------------^  only single expression inside parenthesis.
+  // ------------------------------^  only single expression inside parenthesis.
   EXPECT_FALSE(r.first);
   EXPECT_EQ(r.second.second, 27 /* invalid second 'and' operator */);