adamziel · adamziel · Jun 4, 2024 · Jun 4, 2024 · Jun 4, 2024 · Jun 4, 2024
diff --git a/src/wp-includes/html-api/class-wp-xml-decoder.php b/src/wp-includes/html-api/class-wp-xml-decoder.php
@@ -36,104 +36,64 @@ public static function decode( $text ) {
 			if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
 				break;
 			}
-			// $next_character_reference_at += 1;
 
-			/*
-			 * Capture all bytes that could form a character reference.
-			 *
-			 * This only supports:
-			 *
-			 * * The five mandated character references, that is &amp; &lt; &gt; &quot; &apos;
-			 * * Numeric character references, e.g. &#123; or &#x1A;
-			 *
-			 * XML grammar rule for parsing numeric references is:
-			 *
-			 *     [66] CharRef   ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' [WFC: Legal Character]
-			 *
-			 * See https://www.w3.org/TR/xml/#NT-CharRef
-			 */
-			$token_length = strspn(
-				$text,
-				'ampltgquos#xX0123456789bcdefABCDEF;',
-				$next_character_reference_at + 1,
-				/*
-				 * Limit the length of the token to avoid scanning the entire document in case
-				 * a semicolon is missing.
-				 *
-				 * The maximum supported code point is 10FFFF, which is 9 characters long when
-				 * represented as either decimal or hexadecimal numeric character reference entity.
-				 * Technically, you can also add zeros to the front of the entity, which makes the
-				 * string longer, for example &#00000010FFFF;
-				 *
-				 * We limit this scan to 30 characters, which allows twenty zeros at the front.
-				 */
-				30
-			);
+			if ( '#' === $text[ $next_character_reference_at + 1 ] ) {
+				$is_hex          = 'x' === $text[ $next_character_reference_at + 2 ] || 'X' === $text[ … ];
+				$zeros_start_at  = $next_character_reference_at + 3 + ( $is_hex ? 1 : 0 );
+				$zeros_length    = strspn( $text, '0', $zeros_start_at );
+				$digits_start_at = $zeros_start_at + $zeros_length;
+				$digit_chars     = $is_hex ? '0123456789abcdefABCDEF' : '0123456789';
+				$digits_length   = strspn( $text, $digit_chars, $digits_start_at );
+				$semicolon_at    = $digits_start_at + $digits_length;
+
+				// Must be followed by a semicolon.
+				if ( ';' !== $text[ $semicolon_at ] ) {
+					return false;
+				}
 
-			if ( false === $token_length ) {
-				return null;
-			}
+				// Null bytes cannot be encoded in XML.
+				if ( 0 === $digits_length ) {
+					return false;
+				}
 
-			if ( ';' !== $text[ $next_character_reference_at + 1 + $token_length - 1 ] ) {
 				/*
-				 * In XML, all character references must end with a semicolon.
+				 * Must encode a valid Unicode code point.
+				 * (Avoid parsing more than is necessary).
 				 */
-				return null;
+				$max_digits = $is_hex ? 6 : 7;
+				if ( $digits_length > $max_digits ) {
+					return false;
+				}
+
+				$base       = $is_hex ? 16 : 10;
+				$code_point = intval( substr( $text, $digits_start_at, $digits_length ), $base );
+				if ( if_allowable_code_point( $code_point ) ) {
+					$decoded .= WP_HTML_Decoder::code_point_to_utf8_bytes( $code_point );
+					$at       = $semicolon_at + 1;
+					continue;
+				}
+
+				return false;
 			}
 
-			$token = strtolower( substr( $text, $next_character_reference_at + 1, $token_length - 1 ) );
+			// Must be a named character reference.
+			$name_starts_at = $next_character_reference_at + 1;
 
-			if ( 'amp' === $token ) {
-				$character_reference = '&';
-			} elseif ( 'lt' === $token ) {
-				$character_reference = '<';
-			} elseif ( 'gt' === $token ) {
-				$character_reference = '>';
-			} elseif ( 'quot' === $token ) {
-				$character_reference = '"';
-			} elseif ( 'apos' === $token ) {
-				$character_reference = "'";
-			} else {
-				$code_point = self::parse_code_point( $text, $next_character_reference_at );
-				if ( null === $code_point ) {
-					/*
-					 * > The following are forbidden, and constitute fatal errors:
-					 * > * the appearance of a reference to an unparsed entity, except in the EntityValue in an entity declaration.
-					 *
-					 * See https://www.w3.org/TR/xml/#forbidden
-					 */
-					return null;
-				}
-				$character_reference = WP_HTML_Decoder::code_point_to_utf8_bytes( $code_point );
-				if (
-					'�' === $character_reference &&
-					0xFFFD !== $code_point
-				) {
-					/*
-					 * Stop processing if we got an invalid character AND the reference does not
-					 * specifically refer code point FFFD (�).
-					 *
-					 * > It is a fatal error when an XML processor encounters an entity with an
-					 * > encoding that it is unable to process. It is a fatal error if an XML entity
-					 * > is determined (via default, encoding declaration, or higher-level protocol)
-					 * > to be in a certain encoding but contains byte sequences that are not legal
-					 * > in that encoding. Specifically, it is a fatal error if an entity encoded in
-					 * >  UTF-8 contains any ill-formed code unit sequences, as defined in section
-					 * > 3.9 of Unicode [Unicode]. Unless an encoding is determined by a higher-level
-					 * > protocol, it is also a fatal error if an XML entity contains no encoding
-					 * > declaration and its content is not legal UTF-8 or UTF-16.
-					 *
-					 * See https://www.w3.org/TR/xml/#charencoding
-					 */
-					return null;
+			$standard_entities = array(
+				'amp;'  => '&',
+				'apos;' => "'",
+				'gt;'   => '>',
+				'lt;'   => '<',
+				'quot;' => '"',
+			);
+
+			foreach ( $standard_entities as $name => $replacement ) {
+				if ( substr_compare( $text, $name, $name_starts_at, strlen( $name ) ) ) {
+					$decoded .= $replacement;
+					$at       = $name_starts_at + strlen( $name );
+					break;
 				}
 			}
-
-			$at       = $next_character_reference_at;
-			$decoded .= substr( $text, $was_at, $at - $was_at );
-			$decoded .= $character_reference;
-			$at      += $token_length + 1;
-			$was_at   = $at;
 		}
 
 		if ( 0 === $was_at ) {

diff --git a/src/wp-includes/html-api/class-wp-xml-processor.php b/src/wp-includes/html-api/class-wp-xml-processor.php
@@ -50,15 +50,59 @@ class WP_XML_Processor extends WP_XML_Tag_Processor {
 	 */
 	public $stack_of_open_elements = array();
 
+	/**
+	 * Streams an XML document from one resource to another, one token at a time.
+	 *
+	 * The input is read in chunks of up to `$buffer_size` bytes. After every token
+	 * the processor is yielded to the caller. Whenever the processor pauses at an
+	 * incomplete token, the XML processed so far is written to `$output_stream`,
+	 * another chunk is read, and a fresh processor is created from the unprocessed
+	 * remainder plus the new chunk, carrying over the breadcrumbs and parser context.
+	 *
+	 * @since WP_VERSION
+	 *
+	 * @param resource $input_stream  Stream to read the XML from (read with fread()).
+	 * @param resource $output_stream Stream the processed XML is written to (written with fwrite()).
+	 * @param int      $buffer_size   Optional. Maximum number of bytes to read per chunk. Default 8092.
+	 * @return Generator Yields the active WP_XML_Processor after each token; the
+	 *                   generator's return value is true once the document is finished.
+	 *
+	 * @throws Exception When the processor reports an error, or when the input ends
+	 *                   (or cannot be read) while a token is still incomplete.
+	 */
+	public static function stream_tokens( $input_stream, $output_stream, $buffer_size = 8092 ) {
+		$streamed_data = fread( $input_stream, $buffer_size );
+		if ( false === $streamed_data ) {
+			// A failed first read is treated like an empty document chunk.
+			$streamed_data = '';
+		}
+
+		$breadcrumbs    = array();
+		$parser_context = WP_XML_Processor::IN_PROLOG_CONTEXT;
+		$processor      = new WP_XML_Processor( $streamed_data, $breadcrumbs, $parser_context );
+		while ( true ) {
+			$token_found = $processor->next_token();
+			// Flush any updates enqueued on the processor.
+			$processor->get_updated_xml();
+
+			if ( $processor->paused_at_incomplete_token() ) {
+				fwrite( $output_stream, $processor->get_processed_xml() );
+
+				$next_chunk = fread( $input_stream, $buffer_size );
+
+				/*
+				 * If reading failed, or the stream is exhausted, no further bytes can
+				 * complete the pending token. Retrying would rebuild the very same
+				 * processor state on every iteration and never terminate.
+				 */
+				if ( false === $next_chunk || ( '' === $next_chunk && feof( $input_stream ) ) ) {
+					throw new Exception( 'XML input ended before the current token was complete.' );
+				}
+
+				$processor = new WP_XML_Processor(
+					$processor->get_unprocessed_xml() . $next_chunk,
+					$processor->get_breadcrumbs(),
+					$processor->get_parser_context()
+				);
+				// To make sure <?xml tokens won't be treated as XML declaration
+				// but as processing instructions
+				$processor->had_previous_chunks = true;
+				continue;
+			} elseif ( null !== $processor->get_last_error() ) {
+				throw new Exception( $processor->get_last_error() );
+			} elseif ( ! $token_found ) {
+				// No more tokens: write out what the current processor holds and finish.
+				fwrite( $output_stream, $processor->get_updated_xml() );
+				return true;
+			}
+			yield $processor;
+		}
+	}
+
 	/**
 	 * Constructor.
 	 *
 	 * @since WP_VERSION
 	 *
 	 * @param string $xml XML to process.
 	 */
-	public function __construct( $xml ) {
+	public function __construct( $xml, $breadcrumbs = array(), $parser_context = self::IN_PROLOG_CONTEXT ) {
 		parent::__construct( $xml );
+		// Seed the open-element stack and parser context from the caller so that a
+		// processor can pick up where an earlier one stopped (see stream_tokens()).
+		$this->stack_of_open_elements = $breadcrumbs;
+		$this->parser_context         = $parser_context;
+	}
+
+	/**
+	 * Returns the parser context currently stored on this processor.
+	 *
+	 * The constructor initializes this value from its `$parser_context` argument;
+	 * stream_tokens() passes it on when it creates a processor for the next chunk.
+	 *
+	 * @since WP_VERSION
+	 *
+	 * @return mixed The stored parser context (presumably one of the *_CONTEXT class constants; verify).
+	 */
+	public function get_parser_context() {
+		return $this->parser_context;
 	}
 
 	/**
@@ -149,7 +193,7 @@ public function next_tag( $query = null ) {
 		return false;
 	}
 
-	/**
+	/*
 	 * Sets a bookmark in the XML document.
 	 *
 	 * Bookmarks represent specific places or tokens in the HTML

diff --git a/src/wp-includes/html-api/class-wp-xml-tag-processor.php b/src/wp-includes/html-api/class-wp-xml-tag-processor.php
@@ -618,6 +618,8 @@ class WP_XML_Tag_Processor {
 	 */
 	protected $seek_count = 0;
 
+	public $had_previous_chunks = false;
+
 	/**
 	 * Constructor.
 	 *
@@ -1358,6 +1360,7 @@ private function parse_next_tag() {
 			 */
 			if (
 				0 === $at &&
+				! $this->had_previous_chunks &&
 				! $this->is_closing_tag &&
 				'?' === $xml[ $at + 1 ] &&
 				'x' === $xml[ $at + 2 ] &&
@@ -1483,8 +1486,13 @@ private function parse_next_tag() {
 				! $this->is_closing_tag &&
 				'?' === $xml[ $at + 1 ]
 			) {
+				if ( $at + 4 >= $doc_length ) {
+					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
+					return false;
+				}
+
 				if ( ! (
-					$at + 4 <= $doc_length &&
 					( 'x' === $xml[ $at + 2 ] || 'X' === $xml[ $at + 2 ] ) &&
 					( 'm' === $xml[ $at + 3 ] || 'M' === $xml[ $at + 3 ] ) &&
 					( 'l' === $xml[ $at + 4 ] || 'L' === $xml[ $at + 4 ] )
@@ -1548,7 +1556,6 @@ private function parse_next_attribute() {
 		$this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n/", $this->bytes_already_parsed );
 		if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
-
 			return false;
 		}
 
@@ -1576,7 +1583,6 @@ private function parse_next_attribute() {
 		$this->skip_whitespace();
 		if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
-
 			return false;
 		}
 		switch ( $this->xml[ $this->bytes_already_parsed ] ) {
@@ -1855,6 +1861,19 @@ public function has_bookmark( $bookmark_name ) {
 		return array_key_exists( $bookmark_name, $this->bookmarks );
 	}
 
+	/**
+	 * Returns the part of the document that precedes the parser's current offset.
+	 *
+	 * Pending updates are flushed first; the result is every byte of the
+	 * document before `bytes_already_parsed`.
+	 *
+	 * @since WP_VERSION
+	 *
+	 * @return string XML from the start of the document up to the current parser offset.
+	 */
+	public function get_processed_xml() {
+		// Flush pending updates before slicing the document.
+		$this->get_updated_xml();
+
+		$processed_byte_count = $this->bytes_already_parsed;
+
+		return substr( $this->xml, 0, $processed_byte_count );
+	}
+
+	/**
+	 * Returns the part of the document that starts at the parser's current offset.
+	 *
+	 * Pending updates are flushed first; the result is every byte of the
+	 * document from `bytes_already_parsed` to the end.
+	 *
+	 * @since WP_VERSION
+	 *
+	 * @return string XML from the current parser offset to the end of the document.
+	 */
+	public function get_unprocessed_xml() {
+		// Flush pending updates before slicing the document.
+		$this->get_updated_xml();
+
+		$remainder_starts_at = $this->bytes_already_parsed;
+
+		return substr( $this->xml, $remainder_starts_at );
+	}
+
+
 	/**
 	 * Move the internal cursor in the Tag Processor to a given bookmark's location.
 	 *
@@ -2295,6 +2314,7 @@ public function get_modifiable_text() {
 			 */
 
 			$this->last_error = self::ERROR_SYNTAX;
+			var_dump( $text );
 			_doing_it_wrong(
 				__METHOD__,
 				__( 'Invalid text content encountered.' ),
@@ -2305,6 +2325,36 @@ public function get_modifiable_text() {
 		return $decoded;
 	}
 
+	/**
+	 * Replaces the text of the currently matched text node, comment, or CDATA section.
+	 *
+	 * The replacement is enqueued as a lexical update covering the span described by
+	 * `text_starts_at` and `text_length` (assumed to be the node's inner text, without
+	 * its delimiters; confirm against the tokenizer). How the new value is serialized
+	 * depends on the node type:
+	 *
+	 *  - Text nodes: `&`, `<` and `>` are escaped as XML references.
+	 *  - Comments: written verbatim, because references are not recognized inside
+	 *    comments. Values containing `--` or ending in `-` are rejected since they
+	 *    cannot be represented in a well-formed comment.
+	 *  - CDATA sections: written verbatim, because references are not recognized
+	 *    inside CDATA either. Any `]]>` is split across two adjacent CDATA sections
+	 *    so the text is preserved exactly.
+	 *
+	 * @since WP_VERSION
+	 *
+	 * @param string $new_value Plain (undecoded) text to store in the node.
+	 * @return bool Whether the update was enqueued.
+	 */
+	public function set_modifiable_text( $new_value ) {
+		switch ( $this->parser_state ) {
+			case self::STATE_TEXT_NODE:
+				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+					$this->text_starts_at,
+					$this->text_length,
+					// @TODO This is naive, let's rethink this.
+					htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' )
+				);
+				return true;
+
+			case self::STATE_COMMENT:
+				/*
+				 * The XML grammar forbids "--" inside a comment and a "-" right
+				 * before the closing "-->". Escaping cannot help here because
+				 * references are not expanded within comments.
+				 */
+				if ( false !== strpos( $new_value, '--' ) || '-' === substr( $new_value, -1 ) ) {
+					_doing_it_wrong(
+						__METHOD__,
+						__( 'Comment text cannot contain "--" or end with "-".' ),
+						'WP_VERSION'
+					);
+					return false;
+				}
+
+				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+					$this->text_starts_at,
+					$this->text_length,
+					$new_value
+				);
+				return true;
+
+			case self::STATE_CDATA_NODE:
+				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+					$this->text_starts_at,
+					$this->text_length,
+					/*
+					 * "]]>" would terminate the section early. End the section after
+					 * "]]" and reopen a new one before ">" so the text survives intact.
+					 */
+					str_replace( ']]>', ']]]]><![CDATA[>', $new_value )
+				);
+				return true;
+
+			default:
+				_doing_it_wrong(
+					__METHOD__,
+					__( 'Cannot set text content on a non-text node.' ),
+					'WP_VERSION'
+				);
+				return false;
+		}
+	}
+
 	/**
 	 * Updates or creates a new attribute on the currently matched tag with the passed value.
 	 *