Sync for validator cpp engine and cpp htmlparser (#39797)

* Migrate from `cfg = "host"` to `cfg = "exec"` PiperOrigin-RevId: 488423550 * ...Fix risky undefined behavior when host is empty.... PiperOrigin-RevId: 488505778 * Allow ampjs domain for serving amp js files. PiperOrigin-RevId: 489206825 * No description. PiperOrigin-RevId: 491684683 * Explicitly cast a char32_t to uint32_t before streaming it to output. In C++17, the char32_t is silently formatted as a number, but in C++20, overloads of << accepting wide character types are deleted, breaking compilation of this code. The value also was formatted in decimal afaict, but all other values here are formatted in hex, so do that here as well. PiperOrigin-RevId: 499523425 * Fix AMP for Email spec URLs Some of these URLs are pointing to the AMP for Website documentation. PiperOrigin-RevId: 508406092 * Replace greggrothaus (now a xoogler) with Erwin in OWNERs file. Edited README to: - Move htmlparser out of beta. It is pretty stable and running in several production services since 2019. - Remove references of being maintained by AMP working group, as it will be maintained by ex-engineers of working group. PiperOrigin-RevId: 518921324 * Internal Code Change PiperOrigin-RevId: 540320460 * Internal Code Change PiperOrigin-RevId: 540323049 * Update README.md --------- Co-authored-by: Googler <[email protected]> Co-authored-by: Amaltas Bohra <[email protected]>
ampproject · Feb 5, 2024 · 6e751a4 · 6e751a4
1 parent 4fdf6a7
commit 6e751a4
Show file tree

Hide file tree

Showing 34 changed files with 84 additions and 103 deletions.
diff --git a/validator/cpp/engine/embed_data.bzl b/validator/cpp/engine/embed_data.bzl
@@ -39,7 +39,7 @@ embed_data = rule(
         ),
         "header_generator": attr.label(
             executable = True,
-            cfg = "host",
+            cfg = "exec",
             allow_files = True,
             default = Label(
                 "//cpp/engine/scripts:filecontents_to_cpp_header",

diff --git a/validator/cpp/engine/parse-layout-sizes.h b/validator/cpp/engine/parse-layout-sizes.h
@@ -9,7 +9,7 @@ namespace amp::validator::parse_layout_sizes {
 
 // WARNING: This code is still in development and not ready to be used.
 
-// This is a single representation for the CssSizes object.
+// This is a single represenation for the CssSizes object.
 // It consists of at least a valid size and a possible media condition.
 // See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#Attributes
 struct CssSize {

diff --git a/validator/cpp/engine/parse-srcset_test.cc b/validator/cpp/engine/parse-srcset_test.cc
@@ -150,7 +150,7 @@ TEST(ParseSrcsetTest, LeadingAndTrailingCommasAndCommasInUrl) {
               EqCandidates({{"example.com/,/,/,/,50w", "1x"}}));
 }
 
-TEST(ParseSrcsetTest, NoWhitespace) {
+TEST(ParseSrcsetTest, NoWhitepsace) {
   SrcsetParsingResult result = ParseSourceSet("image 100w,image 50w");
   EXPECT_TRUE(result.success);
   EXPECT_THAT(result.srcset_images,

diff --git a/validator/cpp/engine/testing-utils.cc b/validator/cpp/engine/testing-utils.cc
@@ -97,8 +97,6 @@ const std::map<std::string, TestCase>& TestCases() {
            "external/amphtml-extensions/*/*/test/*.html",
             &html_files)) << "Test cases file pattern not found.";
 
-    CHECK(!html_files.empty()) << "Validator test cases are empty. Will not proceed.";
-
     std::sort(html_files.begin(), html_files.end());
     for (const std::string& html_file : html_files) {
       if (html_file.find("/js_only/") != std::string::npos) continue;

diff --git a/validator/cpp/engine/utf8-util_test.cc b/validator/cpp/engine/utf8-util_test.cc
@@ -14,7 +14,7 @@ TEST(Utf8UtilTest, Utf16StrLen) {
   // It's 34 bytes long and 22 utf-8 characters long. Javascript uses UTF16
   // strings and string lengths.
   // The chars in Iñtërnâtiônàlizætiøn vary between 1 and 2 byte lengths, all
-  // javascript 1-char lengths. The ⚡ is a 3-byte length character, with a
+  // javascript 1-char lenghts. The ⚡ is a 3-byte length character, with a
   // 1-char javascript length. Finally the 💩 is a 4-byte length character with
   // a 2-char javascript length.
   EXPECT_EQ(Utf16StrLen("Iñtërnâtiônàlizætiøn☃💩"), 23);

diff --git a/validator/cpp/engine/validator-internal.cc b/validator/cpp/engine/validator-internal.cc
@@ -644,9 +644,9 @@ struct ParsedReferencePoint {
 class ParsedReferencePoints {
  public:
   ParsedReferencePoints() : parent_(nullptr) {}
-  ParsedReferencePoints(
-      const TagSpec& parent,
-      const unordered_map<std::string, int32_t>& tag_spec_ids_by_tag_spec_name)
+  ParsedReferencePoints(const TagSpec& parent,
+                        const absl::flat_hash_map<std::string, int32_t>&
+                            tag_spec_ids_by_tag_spec_name)
       : parent_(&parent) {
     for (const ReferencePoint& p : parent.reference_points()) {
       auto iter = tag_spec_ids_by_tag_spec_name.find(p.tag_spec_name());
@@ -1041,11 +1041,11 @@ RecordValidated ShouldRecordTagspecValidated(
 // which is unique within its context, the ParsedValidatorRules.
 class ParsedTagSpec {
  public:
-  ParsedTagSpec(
-      ParsedAttrSpecs* parsed_attr_specs,
-      const unordered_map<std::string, int32_t>& tag_spec_ids_by_tag_spec_name,
-      RecordValidated should_record_tagspec_validated, const TagSpec* spec,
-      int32_t id)
+  ParsedTagSpec(ParsedAttrSpecs* parsed_attr_specs,
+                const absl::flat_hash_map<std::string, int32_t>&
+                    tag_spec_ids_by_tag_spec_name,
+                RecordValidated should_record_tagspec_validated,
+                const TagSpec* spec, int32_t id)
       : spec_(spec),
         id_(id),
         reference_points_(*spec, tag_spec_ids_by_tag_spec_name),
@@ -1192,7 +1192,7 @@ class ParsedTagSpec {
 
   // Whether or not the tag should be recorded via
   // Context->RecordTagspecValidated if it was validated
-  // successfully. For performance, this is only done for tags that
+  // successfullly. For performance, this is only done for tags that
   // are mandatory, unique, or possibly required by some other tag.
   RecordValidated ShouldRecordTagspecValidated() const {
     return should_record_tagspec_validated_;
@@ -1218,7 +1218,7 @@ class ParsedTagSpec {
 
   const set<int32_t>& implicit_attrspecs() const { return implicit_attrspecs_; }
 
-  const unordered_map<std::string, int32_t>& attr_ids_by_name() const {
+  const absl::flat_hash_map<std::string, int32_t>& attr_ids_by_name() const {
     return attr_ids_by_name_;
   }
 
@@ -1240,7 +1240,7 @@ class ParsedTagSpec {
   bool is_reference_point_;
   bool is_type_json_ = false;
   bool contains_url_ = false;
-  unordered_map<std::string, int32_t> attr_ids_by_name_;
+  absl::flat_hash_map<std::string, int32_t> attr_ids_by_name_;
   vector<TypeIdentifier> disabled_by_;
   vector<TypeIdentifier> enabled_by_;
   vector<int32_t> mandatory_attr_ids_;
@@ -1284,7 +1284,7 @@ std::string TagSpecUrl(const TagSpec& spec) {
     return StrCat(extension_spec_url_prefix, spec.extension_spec().name());
   if (spec.requires_extension_size() > 0)
     // Return the first |requires_extension|, which should be the most
-    // representative.
+    // representitive.
     return StrCat(extension_spec_url_prefix, spec.requires_extension(0));
 
   return "";
@@ -2476,7 +2476,7 @@ class Context {
     if (!tag_result.best_match_tag_spec) return;
     const ParsedTagSpec* parsed_tag_spec = tag_result.best_match_tag_spec;
     if (!parsed_tag_spec->AttrsCanSatisfyExtension()) return;
-    const unordered_map<std::string, int32_t>& attr_ids_by_name =
+    const absl::flat_hash_map<std::string, int32_t>& attr_ids_by_name =
         parsed_tag_spec->attr_ids_by_name();
     ExtensionsContext* extensions_ctx = mutable_extensions();
     for (const ParsedHtmlTagAttr& attr : encountered_tag.Attributes()) {
@@ -2834,11 +2834,11 @@ class InvalidRuleVisitor : public htmlparser::css::RuleVisitor {
 class InvalidDeclVisitor : public htmlparser::css::RuleVisitor {
  public:
   InvalidDeclVisitor(const ParsedDocCssSpec& css_spec, Context* context,
-                     const std::string& tag_descriptive_name,
+                     const std::string& tag_decriptive_name,
                      ValidationResult* result)
       : css_spec_(css_spec),
         context_(context),
-        tag_descriptive_name_(tag_descriptive_name),
+        tag_descriptive_name_(tag_decriptive_name),
         result_(result) {}
 
   void VisitDeclaration(
@@ -4412,7 +4412,7 @@ void ValidateAttributes(const ParsedTagSpec& parsed_tag_spec,
   set<std::string_view> mandatory_anyofs_seen;
   vector<const ParsedAttrTriggerSpec*> parsed_trigger_specs;
   set<int32_t> attrspecs_validated;
-  const unordered_map<std::string, int32_t>& attr_ids_by_name =
+  const absl::flat_hash_map<std::string, int32_t>& attr_ids_by_name =
       parsed_tag_spec.attr_ids_by_name();
 
   for (const ParsedHtmlTagAttr& attr : encountered_tag.Attributes()) {
@@ -4717,7 +4717,7 @@ ParsedValidatorRules::ParsedValidatorRules(HtmlFormat::Code html_format)
   // |tag_spec_names_to_track| to identify those tagspecs that are
   // referenced by others via "also_requires_tag".  The ParsedTagSpec
   // constructor completes this translation to ids.
-  unordered_map<std::string, int32_t> tag_spec_ids_by_tag_spec_name;
+  absl::flat_hash_map<std::string, int32_t> tag_spec_ids_by_tag_spec_name;
   unordered_set<std::string> tag_spec_names_to_track;
   for (int ii = 0; ii < rules_.tags_size(); ++ii) {
     const TagSpec& tag = rules_.tags(ii);
@@ -5622,7 +5622,7 @@ void ReferencePointMatcher::RecordMatch(const ParsedTagSpec& reference_point) {
 
 void ReferencePointMatcher::ExitParentTag(const Context& context,
                                           ValidationResult* result) const {
-  absl::node_hash_map<int32_t, int32_t> reference_point_by_count;
+  absl::flat_hash_map<int32_t, int32_t> reference_point_by_count;
   for (int32_t r : reference_points_matched_) ++reference_point_by_count[r];
   for (const ParsedReferencePoint& p : *parsed_reference_points_) {
     if (p.point->mandatory() && reference_point_by_count.find(p.tag_spec_id) ==

diff --git a/validator/cpp/engine/validator_test.cc b/validator/cpp/engine/validator_test.cc
@@ -481,8 +481,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmail) {
         ":13:2 The author stylesheet specified in tag 'style amp-custom' "
         "is too long - document contains 75001 bytes whereas the "
         "limit is 75000 "
-        "bytes. (see https://amp.dev/documentation/guides-and-tutorials/email/learn/"
-        "spec/amphtml#maximum-size)");
+        "bytes. (see https://amp.dev/documentation/guides-and-tutorials/email/"
+        "learn/spec/amphtml#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -527,7 +527,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmail) {
         ":19:6 The author stylesheet specified in tag 'style amp-custom' "
         "and the combined inline styles is too large - document contains 75010 "
         "bytes whereas the limit is 75000 bytes. (see https://amp.dev/"
-        "documentation/guides-and-tutorials/email/learn/spec/amphtml#maximum-size)");
+        "documentation/guides-and-tutorials/email/learn/spec/amphtml"
+        "#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -555,7 +556,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmail) {
         ":7519:6 The author stylesheet specified in tag 'style amp-custom' "
         "and the combined inline styles is too large - document contains 75014 "
         "bytes whereas the limit is 75000 bytes. (see https://amp.dev/"
-        "documentation/guides-and-tutorials/email/learn/spec/amphtml#maximum-size)");
+        "documentation/guides-and-tutorials/email/learn/spec/amphtml"
+        "#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -649,8 +651,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmailStrict) {
         ":13:2 The author stylesheet specified in tag 'style amp-custom' "
         "is too long - document contains 75001 bytes whereas the "
         "limit is 75000 "
-        "bytes. (see https://amp.dev/documentation/guides-and-tutorials/email/learn/"
-        "spec/amphtml#maximum-size)");
+        "bytes. (see https://amp.dev/documentation/guides-and-tutorials/email/"
+        "learn/spec/amphtml#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -681,7 +683,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmailStrict) {
         ":19:6 The author stylesheet specified in tag 'style amp-custom' "
         "and the combined inline styles is too large - document contains 75010 "
         "bytes whereas the limit is 75000 bytes. (see https://amp.dev/"
-        "documentation/guides-and-tutorials/email/learn/spec/amphtml#maximum-size)");
+        "documentation/guides-and-tutorials/email/learn/spec/amphtml"
+        "#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -701,7 +704,8 @@ TEST(ValidatorTest, TestCssLengthAmpEmailStrict) {
         ":3769:6 The author stylesheet specified in tag 'style amp-custom' "
         "and the combined inline styles is too large - document contains 75014 "
         "bytes whereas the limit is 75000 bytes. (see https://amp.dev/"
-        "documentation/guides-and-tutorials/email/learn/spec/amphtml#maximum-size)");
+        "documentation/guides-and-tutorials/email/learn/spec/amphtml"
+        "#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 
@@ -730,12 +734,12 @@ TEST(ValidatorTest, TestCssLengthAmpEmailStrict) {
     std::string output = RenderResult(
         /*filename=*/test_case_name,
         amp::validator::Validate(test_html, HtmlFormat::AMP4EMAIL));
-    std::string expected_output =
-        StrCat("FAIL\n", test_case_name,
-               ":17:2 The inline style specified in tag 'div' is too long - it "
-               "contains 1001 bytes whereas the limit is 1000 bytes. (see "
-               "https://amp.dev/documentation/guides-and-tutorials/email/learn/spec/"
-               "amphtml#maximum-size)");
+    std::string expected_output = StrCat(
+        "FAIL\n", test_case_name,
+        ":17:2 The inline style specified in tag 'div' is too long - it "
+        "contains 1001 bytes whereas the limit is 1000 bytes. (see "
+        "https://amp.dev/documentation/guides-and-tutorials/email/learn/spec/"
+        "amphtml#maximum-size)");
     EXPECT_EQ(expected_output, output) << "test case " << test_case_name;
   }
 }

diff --git a/validator/cpp/engine/wasm/BUILD b/validator/cpp/engine/wasm/BUILD
@@ -1,7 +1,6 @@
 # Wraps AMP Validator into a WebAssembly library,
 # which can be used by javascript files.
 
-load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load("@emsdk//emscripten_toolchain:wasm_rules.bzl", "wasm_cc_binary")
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_js_binary", "closure_js_library")
 
@@ -73,8 +72,3 @@ closure_js_binary(
         ":validator_js_lib",
     ],
 )
-
-build_test(
-    name = "validator_js_test",
-    targets = [":validator_js_bin"],
-)
diff --git a/validator/cpp/engine/wasm/validator.js b/validator/cpp/engine/wasm/validator.js
@@ -83,7 +83,7 @@ function digitizeValidationErrorFields(error) {
 /**
  * When transforming validation errors and validation results from jspb to plain
  * objects, the protobuf base64 string is also attached to the output.
- * Hence when a plain object needs to be transformed back to protobuf,
+ * Hence when a plain object neeeds to be transformed back to protobuf,
  * the attached base64 could be directly used.
  */
 const PB_BASE64 = '_PB_BASE64';

diff --git a/validator/cpp/htmlparser/BUILD b/validator/cpp/htmlparser/BUILD
@@ -71,7 +71,7 @@ cc_test(
 )
 
 # Similar to go lang's defer statement. Defers the execution of statement
-# until in which it is declared goes out of scope.
+# until in which it is decalred goes out of scope.
 cc_library(
     name = "defer",
     hdrs = [
@@ -80,7 +80,7 @@ cc_library(
     copts = ["-std=c++17"],
 )
 
-# Helper library declares various doctype constants and a utility function to
+# Helper library decalres various doctype constants and a utility function to
 # parse doctype string and extract various components in it.
 cc_library(
     name = "doctype",

diff --git a/validator/cpp/htmlparser/README.md b/validator/cpp/htmlparser/README.md
@@ -1,21 +1,6 @@
 # HTML Parser
 
-This is an HTML5 compliant parser written in C++. It was created to be used by
-the
-[AMPHTML Validator](https://github.com/ampproject/amphtml/tree/main/validator)
-to standardize how AMPHTML documents should be parsed for AMP validation.
-
-## Maintainers
-
-This parser is maintained by the [AMP Working Group](https://amp.dev/community/working-groups/amp4email/):
-[Caching](https://amp.dev/community/working-groups/caching/)
-
-## Current Status
-
-This parser is in active development and has several outstanding TODOs.
-These TODOs may cause certain parsing tests to fail. Those tests have been
-excluded until the TODOs are resolved. See htmldataset_test.cc for a list of
-those tests.
+This is an HTML5 compliant parser written in C++.
 
 ## Building and Testing with Bazel
 

diff --git a/validator/cpp/htmlparser/allocator.h b/validator/cpp/htmlparser/allocator.h
@@ -62,7 +62,7 @@
 // is naturally aligned if the address used to identify it has an 8-byte
 // alignment.
 //
-// Following data structure contains members totaling 13 bytes, but it's actual
+// Following data struture contains members totaling 13 bytes, but it's actual
 // size is 24 bytes due to 8 byte alignment.
 //
 // Alignment is always equal to the largest sized element in the structure.
@@ -201,10 +201,10 @@ class Allocator {
   Allocator& operator=(const Allocator&) = delete;
 
   // Allocates memory of same size required to construct object of type T.
-  // Returns nullptr if allocation failed.
+  // Returns nullptr if alloction failed.
   void* Allocate() {
     // Checks if remaining bytes in block are less than object size, or
-    // remaining bytes after alignment is less than object size.
+    // reamining bytes after alignment is less than object size.
     // Add a new block.
     if (object_size_ > remaining_ || !AlignFreeAddress()) {
       if (!NewBlock()) return nullptr;
@@ -338,7 +338,7 @@ class Allocator {
   }
 
   // If the block's address is not aligned, moves the pointer to the address
-  // that is multiple of alignment_.
+  // that is multiple of aligment_.
   bool AlignFreeAddress() {
     // Checks how many bytes to skip to be at the correct alignment.
     if (const std::size_t skip =

diff --git a/validator/cpp/htmlparser/bin/entitytablegen.cc b/validator/cpp/htmlparser/bin/entitytablegen.cc
@@ -139,7 +139,7 @@ int main(int argc, char** argv) {
 
       if ((code_point & 0xffffff80) == 0) {  // 1 byte sequence.
         // 0b0xxxxxx.
-        fd << "\\x" << code_point;
+        fd << "\\x" << std::hex << static_cast<uint32_t>(code_point);
       } else if ((code_point & 0xfffff800) == 0) {  // 2 byte sequence.
         // 0b110xxxxx 0b10xxxxxx.
         fd << "\\x" << std::hex << ((code_point >> 6) | 0xc0)

diff --git a/validator/cpp/htmlparser/css/parse-css-urls.cc b/validator/cpp/htmlparser/css/parse-css-urls.cc
@@ -47,7 +47,7 @@ void Preprocess(vector<char32_t>* codepoints) {
         out.push_back('\n');
         last_codepoint_was_cr = true;
         break;
-      case '\f':  // also known as form feed (FF)
+      case '\f':  // also knwon as form feed (FF)
         out.push_back('\n');
         last_codepoint_was_cr = false;
         break;

diff --git a/validator/cpp/htmlparser/css/parse-css.cc b/validator/cpp/htmlparser/css/parse-css.cc
@@ -85,7 +85,7 @@ const std::string& Token::StringValue() const {
 }
 
 std::string Token::ToString() const {
-  // The following are overridden in their class: AT_KEYWORD, CLOSE_CURLY,
+  // The following are overriden in their class: AT_KEYWORD, CLOSE_CURLY,
   // CLOSE_PAREN, CLOSE_SQUARE, DELIM, DIMENSION, FUNCTION_TOKEN, IDENT,
   // NUMBER, OPEN_CURLY, OPEN_PAREN, OPEN_SQUARE, PERCENTAGE, STRING, URL
   switch (Type()) {
@@ -343,7 +343,7 @@ bool Whitespace(char32_t code) {
 char32_t kMaximumallowedcodepoint = 0x10ffff;
 
 // A MarkedPosition object saves position information from the tokenizer
-// provided as |line| and |col| to the constructor and can later write that
+// rovided as |line| and |col| to the constructor and can later write that
 // position back to a Token object.
 class MarkedPosition {
  public:
@@ -2471,7 +2471,7 @@ CombinatorType::Code CombinatorTypeForToken(const Token& token) {
   if (IsDelim(token, "+")) return CombinatorType::ADJACENT_SIBLING;
   if (IsDelim(token, "~")) return CombinatorType::GENERAL_SIBLING;
   // CombinatorTypeForToken is only ever called if the token has one of these
-  // delimiters, so reaching this point is impossible.
+  // delimitors, so reaching this point is impossible.
   CHECK(false) << absl::StrCat(
       "not a combinator token - type=", TokenType::Code_Name(token.Type()),
       " value=", token.StringValue());

diff --git a/validator/cpp/htmlparser/css/parse-css.h b/validator/cpp/htmlparser/css/parse-css.h
@@ -741,7 +741,7 @@ class Selector : public Token {
   virtual void Accept(SelectorVisitor* visitor) const = 0;
 };
 
-// This node models type selectors and universal selectors.
+// This node models type selectors and universial selectors.
 // http://www.w3.org/TR/css3-selectors/#type-selectors
 // http://www.w3.org/TR/css3-selectors/#universal-selector
 class TypeSelector : public Selector {