diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
index 43137d386275f..ad869d0415bf0 100644
--- a/src/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -1664,6 +1664,24 @@ private function parse_next_tag() {
$this->text_starts_at = $this->token_starts_at + 2;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
+
+ /*
+ * Identify nodes that would be CDATA if HTML had CDATA sections. The
+ * shortest match is `<![CDATA[]]>` (12 bytes), and both closing brackets
+ * are required: the offsets below strip `[CDATA[` and `]]`, so a lone `]`
+ * would trim real text or leave a negative text length.
+ */
+ if (
+ $this->token_length >= 12 &&
+ 0 === substr_compare( $html, '[CDATA[', $this->token_starts_at + 2, 7 ) &&
+ ']' === $html[ $closer_at - 2 ] &&
+ ']' === $html[ $closer_at - 1 ]
+ ) {
+ $this->parser_state = self::STATE_CDATA_NODE;
+ $this->text_starts_at += 7;
+ $this->text_length -= 9;
+ }
+
return true;
}
@@ -1700,6 +1718,41 @@ private function parse_next_tag() {
$this->text_starts_at = $this->token_starts_at + 2;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
+
+ /*
+ * Identify nodes that would be Processing Instructions if HTML had them.
+ *
+ * XML allows for more target names, but this code only identifies
+ * a subset. This is more or less okay because ultimately these are
+ * HTML comments in the DOM and this safely supports _some_ kinds
+ * of PI Nodes without getting lost while parsing.
+ *
+ * This code identifies processing instruction nodes whose target
+ * name can be represented in single-byte UTF-8 / 7-bit ASCII.
+ *
+ * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
+ * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
+ * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
+ * [#x10000-#xEFFFF]
+ * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+ *
+ * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
+ */
+ if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
+ $comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
+ $pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
+
+ if ( 0 < $pi_target_length ) {
+ $pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
+ $this->parser_state = self::STATE_PI_NODE;
+ $this->tag_name_starts_at = $this->token_starts_at + 2;
+ $this->tag_name_length = $pi_target_length;
+ // The text begins after the target name and excludes the trailing question mark.
+ $this->text_starts_at += $pi_target_length;
+ $this->text_length -= $pi_target_length + 1;
+ }
+ }
+
return true;
}
@@ -2507,6 +2560,9 @@ public function get_token_type() {
case self::STATE_DOCTYPE:
return '#doctype';
+ case self::STATE_PI_NODE:
+ return '#processing-instruction';
+
default:
return $this->get_token_name();
}
@@ -2540,6 +2596,12 @@ public function get_token_name() {
case self::STATE_TEXT_NODE:
return '#text';
+ case self::STATE_CDATA_NODE:
+ return '#cdata-section';
+
+ case self::STATE_PI_NODE:
+ return substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
+
case self::STATE_COMMENT:
return '#comment';
@@ -2576,7 +2638,15 @@ public function get_modifiable_text() {
$at = $this->text_starts_at;
$length = $this->text_length;
$text = substr( $this->html, $at, $length );
- $text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
+
+ if (
+ self::STATE_CDATA_NODE === $this->parser_state ||
+ self::STATE_PI_NODE === $this->parser_state
+ ) {
+ return $text;
+ }
+
+ $text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
if ( empty( $text ) ) {
return '';
@@ -3131,6 +3201,38 @@ private function matches() {
*/
const STATE_TEXT_NODE = 'STATE_TEXT_NODE';
+ /**
+ * Parser CDATA Node State.
+ *
+ * Indicates that the parser has found a CDATA node and it's possible
+ * to read and modify its modifiable text. Note that in HTML there are
+ * no CDATA nodes outside foreign elements (SVG and MathML). Outside
+ * of foreign elements, they are treated as HTML comments. Nonetheless,
+ * the Tag Processor still recognizes them as they appear in the HTML
+ * stream and exposes them for inspection and modification.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
+
+ /**
+ * Parser Processing Instruction State.
+ *
+ * Indicates that the parser has found a Processing Instruction and
+ * it's possible to read and modify its modifiable text. Note that in
+ * HTML there are no Processing Instruction nodes and they are treated
+ * as HTML comments. Nonetheless, the Tag Processor still recognizes
+ * them as they appear in the HTML stream and exposes them for
+ * inspection and modification.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_PI_NODE = 'STATE_PI_NODE';
+
/**
* Indicates that the parser has found an HTML comment and it's
* possible to read and modify its modifiable text.