From ec86b89d67fac1d8f71b6315cd895ac2616cc7eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 4 Jun 2024 17:06:49 +0200 Subject: [PATCH 1/8] =?UTF-8?q?XML=20Stream=20Processing=20=E2=80=93=20Pro?= =?UTF-8?q?of=20of=20concept?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html-api/class-wp-xml-processor.php | 76 ++-- .../html-api/class-wp-xml-tag-processor.php | 6 + src/wp-includes/html-api/test.php | 118 ++++++ src/wp-includes/html-api/test.wxr | 353 ++++++++++++++++++ 4 files changed, 522 insertions(+), 31 deletions(-) create mode 100644 src/wp-includes/html-api/test.php create mode 100644 src/wp-includes/html-api/test.wxr diff --git a/src/wp-includes/html-api/class-wp-xml-processor.php b/src/wp-includes/html-api/class-wp-xml-processor.php index 896ce4ada2f0d..d5e218ae95731 100644 --- a/src/wp-includes/html-api/class-wp-xml-processor.php +++ b/src/wp-includes/html-api/class-wp-xml-processor.php @@ -57,8 +57,15 @@ class WP_XML_Processor extends WP_XML_Tag_Processor { * * @param string $xml XML to process. */ - public function __construct( $xml ) { + public function __construct( $xml, $breadcrumbs = array(), $parser_context = self::IN_PROLOG_CONTEXT ) { parent::__construct( $xml ); + $this->stack_of_open_elements = $breadcrumbs; + $this->parser_context = $parser_context; + } + + public function get_parser_context() + { + return $this->parser_context; } /** @@ -180,38 +187,41 @@ public function get_inner_text() { return $this->get_modifiable_text(); } - try { - $text = ''; - $depth = 1; - while ( $depth > 0 && $this->base_class_next_token() ) { - switch ( $this->get_token_type() ) { - case '#tag': - if ( $this->is_empty_element() ) { - continue 2; - } - if ( $this->is_tag_closer() ) { - --$depth; - } else { - ++$depth; - } - $text .= $this->get_modifiable_text(); - break; - case '#text': - case '#cdata-section': - if ( $depth > 0 ) { - $text .= $this->get_modifiable_text(); - } - break; - default: + $text = ''; + $depth = 1; + do { + switch ( $this->get_token_type() ) { + case '#tag': + if ( $this->is_empty_element() ) { continue 2; - } + } + if ( $this->is_tag_closer() ) { + --$depth; + } else { + ++$depth; + } + $text .= $this->get_modifiable_text(); + break; + case '#text': + case '#cdata-section': + if ( $depth > 0 ) { + $text .= $this->get_modifiable_text(); + } + break; + default: + continue 2; } + } while ( $depth > 0 && $this->base_class_next_token() ); + + $this->seek( 'inner_text' ); + $this->release_bookmark( 'inner_text' ); - return $text; - } finally { - $this->seek( 'inner_text' ); - $this->release_bookmark( 'inner_text' ); + if( $depth !== 0 ) { + $this->parser_state = WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT; + return false; } + + return $text; } /** @@ -357,7 +367,7 @@ public function has_bookmark( $bookmark_name ) { * @return false */ public function next_token() { - return $this->next_tag(); + return $this->step(); } /** @@ -590,7 +600,11 @@ public function matches_breadcrumbs( $breadcrumbs ) { // Start at the last crumb. $crumb = end( $breadcrumbs ); - if ( '*' !== $crumb && $this->get_tag() !== $crumb ) { + if ( + '#tag' === $this->get_token_type() && + '*' !== $crumb && + $this->get_tag() !== $crumb + ) { return false; } diff --git a/src/wp-includes/html-api/class-wp-xml-tag-processor.php b/src/wp-includes/html-api/class-wp-xml-tag-processor.php index 6b77884b2427c..a83df2f3efc6a 100644 --- a/src/wp-includes/html-api/class-wp-xml-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-xml-tag-processor.php @@ -1831,6 +1831,12 @@ public function has_bookmark( $bookmark_name ) { return array_key_exists( $bookmark_name, $this->bookmarks ); } + public function get_unparsed_xml() + { + return substr($this->xml, $this->bytes_already_parsed); + } + + /** * Move the internal cursor in the Tag Processor to a given bookmark's location. * diff --git a/src/wp-includes/html-api/test.php b/src/wp-includes/html-api/test.php new file mode 100644 index 0000000000000..0d6bbeeb2ea15 --- /dev/null +++ b/src/wp-includes/html-api/test.php @@ -0,0 +1,118 @@ +The & < > " ' 🅰 � “😄” ' +// // 'The open source publishing platform of choice for millions of websites worldwide—from creators and small businesses' +// ); +// // $processor->declare_element_as_pcdata('wp:post'); +// $processor->next_tag('wp:post'); +// var_dump($processor->get_modifiable_text()); +// var_dump($processor->get_inner_text()); +// var_dump(phpversion()); +// $xml = ''; +// $expected_match = ''; +// $match_nth_token = 1; +// $processor = new class( $xml ) extends WP_XML_Tag_Processor { +// /** +// * Returns the raw span of XML for the currently-matched +// * token, or null if not paused on any token. +// * +// * @return string|null Raw XML content of currently-matched token, +// * otherwise `null` if not matched. +// */ +// public function get_raw_token() { +// if ( +// WP_XML_Tag_Processor::STATE_READY === $this->parser_state || +// WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || +// WP_XML_Tag_Processor::STATE_COMPLETE === $this->parser_state +// ) { +// return null; +// } + +// $this->set_bookmark( 'mark' ); +// $mark = $this->bookmarks['mark']; + +// return substr( $this->xml, $mark->start, $mark->length ); +// } +// }; + +// for ( $i = 0; $i < $match_nth_token; $i++ ) { +// $processor->next_token(); +// } + +// $raw_token = $processor->get_raw_token(); +// var_dump($raw_token); +// var_dump($expected_match); +// die(); + +// $i = 0; +// while( $processor->next_tag(array()) ) { +// echo "\n " . dump_token($processor); +// echo $processor->get_inner_text(); +// } +// die(); + + + +function chunk_text($text) { + $length = strlen($text); + $chunks_nb = ceil($length / 1000); + for( $i = 0; $i < $chunks_nb; $i++ ) { + yield substr($text, $i * 1000, 1000); + } +} + +function stream_next_xml_token( $get_next_chunk ) { + $streamed_data = $get_next_chunk->current(); + $get_next_chunk->next(); + + $breadcrumbs = array(); + $parser_context = WP_XML_Processor::IN_PROLOG_CONTEXT; + $processor = new WP_XML_Processor( $streamed_data, $breadcrumbs, $parser_context ); + while(true) { + $token_found = $processor->next_token(); + + if ($processor->paused_at_incomplete_token()) { + $processor = new WP_XML_Processor( + // ' ' is a hack to avoid treating repeated get_unparsed_xml() . $get_next_chunk->current(), + $processor->get_breadcrumbs(), + $processor->get_parser_context() + ); + $get_next_chunk->next(); + continue; + } else if($processor->get_last_error() !== null) { + echo "\n ERROR: "; + var_dump($processor->get_last_error()); + return false; + } else if (!$token_found) { + // finished + return true; + } + yield $processor; + } +} + + +$wxr = file_get_contents(__DIR__ . '/test.wxr'); +$tokens = stream_next_xml_token(chunk_text($wxr)); +foreach($tokens as $processor) { + if ($processor->get_token_type() === '#cdata-section' && $processor->matches_breadcrumbs(array('content:encoded'))) { + echo "\n " . dump_token($processor); + } + // echo "\n " . dump_token($processor); +} diff --git a/src/wp-includes/html-api/test.wxr b/src/wp-includes/html-api/test.wxr new file mode 100644 index 0000000000000..b3b59f007a5e4 --- /dev/null +++ b/src/wp-includes/html-api/test.wxr @@ -0,0 +1,353 @@ + + + + + + + + + + + + + + + + + + + + + + Long Bets + http://longbets.org + The Arena for Accountable Predictions + Tue, 25 Jan 2011 06:50:19 +0000 + http://wordpress.org/?v=3.0.4 + en + 1.0 + http://longbets.org + http://longbets.org + + uncategorized + + + + http://wordpress.org/?v=3.0.4 + + + Hello world! + http://longbets.org/?p=1 + Tue, 25 Jan 2011 06:47:53 +0000 + + + + http://longbets.org/?p=1 + + + + 1 + 2011-01-25 06:47:53 + 2011-01-25 06:47:53 + open + open + hello-world + publish + 0 + 0 + post + + 0 + + 1 + + + http://wordpress.org/ + + 2011-01-25 06:47:53 + 2011-01-25 06:47:53 + To delete a comment, just log in and view the post's comments. There you will have the option to edit or delete them.]]> + 1 + + 0 + 0 + + + + + + Second Post, by me! + http://longbets.org/?p=4 + Tue, 25 Jan 2011 06:49:20 +0000 + + + + http://longbets.org/?p=4 + + Lorizzle that's the shizzle dolor break yo neck, yall mah nizzle, pot adipiscing sizzle. Away sapien velizzle, sizzle volutpizzle, suscipizzle quizzle, gravida vizzle, fo shizzle my nizzle. For sure check out this tortor. Sizzle sure. Fusce izzle dolor dapibizzle for sure crunk. Mauris shizzlin dizzle nibh izzle turpis. Cool izzle tortor. Pellentesque mofo bling bling shizzlin dizzle. Dope hizzle mammasay mammasa mamma oo sa break yo neck, yall dictumst. I saw beyonces tizzles and my pizzle went crizzle dapibizzle. Yo mamma shizzle my nizzle crocodizzle urna, pretizzle eu, stuff rizzle, eleifend tellivizzle, nunc. For sure suscipizzle. Integizzle sempizzle own yo' sizzle fo.]]> + + 4 + 2011-01-25 06:49:20 + 2011-01-25 06:49:20 + open + open + second-post-by-me + publish + 0 + 0 + post + + 0 + + _edit_last + + + + _edit_lock + + + + _encloseme + + + + _wp_old_slug + + + + 2 + + bkeating@gmail.com + + 192.168.1.101 + 2011-01-25 06:52:17 + 2011-01-25 06:52:17 + + 1 + + 0 + 1 + + + 3 + + bkeating@gmail.com + + 192.168.1.101 + 2011-01-25 06:52:37 + 2011-01-25 06:52:37 + + 1 + + 0 + 1 + + + 4 + + bkeating@gmail.com + + 192.168.1.101 + 2011-01-25 06:52:59 + 2011-01-25 06:52:59 + + 1 + + 0 + 1 + + + + + + Third Post + http://longbets.org/?p=6 + Tue, 25 Jan 2011 06:49:59 +0000 + + + + http://longbets.org/?p=6 + + Lorizzle that's the shizzle dolor break yo neck, yall mah nizzle, pot adipiscing sizzle. Away sapien velizzle, sizzle volutpizzle, suscipizzle quizzle, gravida vizzle, fo shizzle my nizzle. For sure check out this tortor. Sizzle sure. Fusce izzle dolor dapibizzle for sure crunk. Mauris shizzlin dizzle nibh izzle turpis. Cool izzle tortor. Pellentesque mofo bling bling shizzlin dizzle. Dope hizzle mammasay mammasa mamma oo sa break yo neck, yall dictumst. I saw beyonces tizzles and my pizzle went crizzle dapibizzle. Yo mamma shizzle my nizzle crocodizzle urna, pretizzle eu, stuff rizzle, eleifend tellivizzle, nunc. For sure suscipizzle. Integizzle sempizzle own yo' sizzle fo.]]> + + 6 + 2011-01-25 06:49:59 + 2011-01-25 06:49:59 + open + open + third-post + publish + 0 + 0 + post + + 0 + + _edit_last + + + + _edit_lock + + + + _encloseme + + + + _wp_old_slug + + + + 5 + + bkeating@gmail.com + + 192.168.1.101 + 2011-01-25 06:53:23 + 2011-01-25 06:53:23 + + 1 + + 0 + 1 + + + 6 + + bkeating@gmail.com + + 192.168.1.101 + 2011-01-25 06:53:29 + 2011-01-25 06:53:29 + + 1 + + 5 + 1 + + + + + + Fourth Post + http://longbets.org/?p=8 + Tue, 25 Jan 2011 06:50:09 +0000 + + + + http://longbets.org/?p=8 + + + + 8 + 2011-01-25 06:50:09 + 2011-01-25 06:50:09 + open + open + fourth-post + publish + 0 + 0 + post + + 0 + + _edit_last + + + + _edit_lock + + + + _encloseme + + + + _wp_old_slug + + + + + + + Fifth Post + http://longbets.org/?p=10 + Tue, 25 Jan 2011 06:50:19 +0000 + + + + http://longbets.org/?p=10 + + + + 10 + 2011-01-25 06:50:19 + 2011-01-25 06:50:19 + open + open + fifth-post + publish + 0 + 0 + post + + 0 + + _edit_last + + + + _edit_lock + + + + _encloseme + + + + _wp_old_slug + + + + 7 + + bkeating@gmail.com + + 192.168.1.101 + 2011-01-25 06:53:52 + 2011-01-25 06:53:52 + + 1 + + 0 + 1 + + + 8 + + bkeating@gmail.com + + 192.168.1.101 + 2011-01-25 06:53:58 + 2011-01-25 06:53:58 + + 1 + + 0 + 1 + + + + + + \ No newline at end of file From daf243faf70f52c0ec4375f64e0d98e3e061fabd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 4 Jun 2024 17:10:37 +0200 Subject: [PATCH 2/8] Test get_inner_text --- src/wp-includes/html-api/test.php | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/wp-includes/html-api/test.php b/src/wp-includes/html-api/test.php index 0d6bbeeb2ea15..221e5c139ad3c 100644 --- a/src/wp-includes/html-api/test.php +++ b/src/wp-includes/html-api/test.php @@ -116,3 +116,17 @@ function stream_next_xml_token( $get_next_chunk ) { } // echo "\n " . dump_token($processor); } + +function dump_token(WP_XML_Processor $p) { + $result = $p->get_token_type() . ' '; + switch($p->get_token_type()) { + case '#tag': + $result .= '(' . $p->get_token_name() . ')' . ' IN ' . implode( ' > ', $p->get_breadcrumbs() ); + break; + case '#text': + case '#cdata-section': + $result .= '(' . preg_replace('~\s+~', ' ', $p->get_inner_text()) . ')'; + break; + } + return $result; +} \ No newline at end of file From e2024a12510e456bf25c45acae01a067cdccd607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 4 Jun 2024 17:17:20 +0200 Subject: [PATCH 3/8] Remove get_inner_text --- src/wp-includes/html-api/test.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/test.php b/src/wp-includes/html-api/test.php index 221e5c139ad3c..3df1dc36e8cc4 100644 --- a/src/wp-includes/html-api/test.php +++ b/src/wp-includes/html-api/test.php @@ -125,7 +125,7 @@ function dump_token(WP_XML_Processor $p) { break; case '#text': case '#cdata-section': - $result .= '(' . preg_replace('~\s+~', ' ', $p->get_inner_text()) . ')'; + $result .= '(' . preg_replace('~\s+~', ' ', $p->get_modifiable_text()) . ')'; break; } return $result; From 3328387851fa2adb0b16d2e66469c2f833a233f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 4 Jun 2024 17:17:52 +0200 Subject: [PATCH 4/8] Cleanup test.php --- src/wp-includes/html-api/test.php | 75 ++++--------------------------- 1 file changed, 8 insertions(+), 67 deletions(-) diff --git a/src/wp-includes/html-api/test.php b/src/wp-includes/html-api/test.php index 3df1dc36e8cc4..fa161e7daa10c 100644 --- a/src/wp-includes/html-api/test.php +++ b/src/wp-includes/html-api/test.php @@ -9,64 +9,15 @@ require __DIR__ . "/class-wp-xml-tag-processor.php"; require __DIR__ . "/class-wp-xml-processor.php"; -// var_dump( -// WP_XML_Decoder::decode('“““') // The & < > " ' 🅰 � “😄”') -// ); - -// die(); -// $processor = new WP_XML_Processor( -// 'The & < > " ' 🅰 � “😄” ' -// // 'The open source publishing platform of choice for millions of websites worldwide—from creators and small businesses' -// ); -// // $processor->declare_element_as_pcdata('wp:post'); -// $processor->next_tag('wp:post'); -// var_dump($processor->get_modifiable_text()); -// var_dump($processor->get_inner_text()); -// var_dump(phpversion()); -// $xml = ''; -// $expected_match = ''; -// $match_nth_token = 1; -// $processor = new class( $xml ) extends WP_XML_Tag_Processor { -// /** -// * Returns the raw span of XML for the currently-matched -// * token, or null if not paused on any token. -// * -// * @return string|null Raw XML content of currently-matched token, -// * otherwise `null` if not matched. -// */ -// public function get_raw_token() { -// if ( -// WP_XML_Tag_Processor::STATE_READY === $this->parser_state || -// WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || -// WP_XML_Tag_Processor::STATE_COMPLETE === $this->parser_state -// ) { -// return null; -// } - -// $this->set_bookmark( 'mark' ); -// $mark = $this->bookmarks['mark']; - -// return substr( $this->xml, $mark->start, $mark->length ); -// } -// }; - -// for ( $i = 0; $i < $match_nth_token; $i++ ) { -// $processor->next_token(); -// } - -// $raw_token = $processor->get_raw_token(); -// var_dump($raw_token); -// var_dump($expected_match); -// die(); - -// $i = 0; -// while( $processor->next_tag(array()) ) { -// echo "\n " . dump_token($processor); -// echo $processor->get_inner_text(); -// } -// die(); - +$wxr = file_get_contents(__DIR__ . '/test.wxr'); +$tokens = stream_next_xml_token(chunk_text($wxr)); +foreach($tokens as $processor) { + if ($processor->get_token_type() === '#cdata-section' && $processor->matches_breadcrumbs(array('content:encoded'))) { + echo "\n " . dump_token($processor); + } + // echo "\n " . dump_token($processor); +} function chunk_text($text) { $length = strlen($text); @@ -107,16 +58,6 @@ function stream_next_xml_token( $get_next_chunk ) { } } - -$wxr = file_get_contents(__DIR__ . '/test.wxr'); -$tokens = stream_next_xml_token(chunk_text($wxr)); -foreach($tokens as $processor) { - if ($processor->get_token_type() === '#cdata-section' && $processor->matches_breadcrumbs(array('content:encoded'))) { - echo "\n " . dump_token($processor); - } - // echo "\n " . dump_token($processor); -} - function dump_token(WP_XML_Processor $p) { $result = $p->get_token_type() . ' '; switch($p->get_token_type()) { From 30fd26f0d39d3345f177a35ebca2738b67c40e8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 4 Jun 2024 17:18:00 +0200 Subject: [PATCH 5/8] Revove commented code --- src/wp-includes/html-api/test.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/wp-includes/html-api/test.php b/src/wp-includes/html-api/test.php index fa161e7daa10c..95bb29369dc93 100644 --- a/src/wp-includes/html-api/test.php +++ b/src/wp-includes/html-api/test.php @@ -16,7 +16,6 @@ if ($processor->get_token_type() === '#cdata-section' && $processor->matches_breadcrumbs(array('content:encoded'))) { echo "\n " . dump_token($processor); } - // echo "\n " . dump_token($processor); } function chunk_text($text) { From 577dee59fe7111593ea3926a68143be80eeb0832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 4 Jun 2024 17:18:16 +0200 Subject: [PATCH 6/8] Whitespaces --- src/wp-includes/html-api/test.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/test.php b/src/wp-includes/html-api/test.php index 95bb29369dc93..f885cbde14fe6 100644 --- a/src/wp-includes/html-api/test.php +++ b/src/wp-includes/html-api/test.php @@ -13,7 +13,10 @@ $wxr = file_get_contents(__DIR__ . '/test.wxr'); $tokens = stream_next_xml_token(chunk_text($wxr)); foreach($tokens as $processor) { - if ($processor->get_token_type() === '#cdata-section' && $processor->matches_breadcrumbs(array('content:encoded'))) { + if ( + $processor->get_token_type() === '#cdata-section' && + $processor->matches_breadcrumbs(array('content:encoded')) + ) { echo "\n " . dump_token($processor); } } From 5c489991696385e47538ad2d20b2a56837f21c3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 6 Jun 2024 11:12:51 +0200 Subject: [PATCH 7/8] Support streaming XML --- .../html-api/class-wp-xml-processor.php | 38 ++ .../html-api/class-wp-xml-tag-processor.php | 60 ++- src/wp-includes/html-api/test.wxr | 353 ------------------ .../phpunit/tests/html-api/wpXmlProcessor.php | 26 ++ .../tests/html-api/wpXmlTagProcessor.php | 24 ++ 5 files changed, 141 insertions(+), 360 deletions(-) delete mode 100644 src/wp-includes/html-api/test.wxr diff --git a/src/wp-includes/html-api/class-wp-xml-processor.php b/src/wp-includes/html-api/class-wp-xml-processor.php index 1f71e2b27274d..d3e5f40c8998d 100644 --- a/src/wp-includes/html-api/class-wp-xml-processor.php +++ b/src/wp-includes/html-api/class-wp-xml-processor.php @@ -50,6 +50,44 @@ class WP_XML_Processor extends WP_XML_Tag_Processor { */ public $stack_of_open_elements = array(); + public static function stream_next_xml_token( $input_stream, $output_stream, $buffer_size = 8092 ) { + $streamed_data = fread( $input_stream, $buffer_size ); + + $breadcrumbs = array(); + $parser_context = WP_XML_Processor::IN_PROLOG_CONTEXT; + $processor = new WP_XML_Processor( $streamed_data, $breadcrumbs, $parser_context ); + while ( true ) { + $token_found = $processor->next_token(); + $processor->get_updated_xml(); + + if ( $processor->paused_at_incomplete_token() ) { + fwrite( $output_stream, $processor->get_processed_xml() ); + + $next_chunk = fread( $input_stream, $buffer_size ); + if ( $next_chunk === false ) { + $next_chunk = ''; + } + + $processor = new WP_XML_Processor( + $processor->get_unprocessed_xml() . $next_chunk, + $processor->get_breadcrumbs(), + $processor->get_parser_context() + ); + // To make sure had_previous_chunks = true; + continue; + } elseif ( null !== $processor->get_last_error() ) { + throw new Exception( $processor->get_last_error() ); + } elseif ( ! $token_found ) { + fwrite( $output_stream, $processor->get_updated_xml() ); + // finished + return true; + } + yield $processor; + } + } + /** * Constructor. * diff --git a/src/wp-includes/html-api/class-wp-xml-tag-processor.php b/src/wp-includes/html-api/class-wp-xml-tag-processor.php index 21ee7dedbc62e..caaa958d56a76 100644 --- a/src/wp-includes/html-api/class-wp-xml-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-xml-tag-processor.php @@ -602,6 +602,8 @@ class WP_XML_Tag_Processor { */ protected $seek_count = 0; + public $had_previous_chunks = false; + /** * Constructor. * @@ -1167,11 +1169,13 @@ private function parse_next_tag() { $at = strpos( $xml, '<', $at ); /* - * This does not imply an incomplete parse; it indicates that there - * can be nothing left in the document other than a #text node. + * It's probably an incomplete parse – there can be no text + * nodes outside of elements. */ if ( false === $at ) { - $at = strlen( $xml ); + // @TODO: Ignore whitespace at the end of the document. + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } if ( $at > $was_at ) { @@ -1335,6 +1339,7 @@ private function parse_next_tag() { */ if ( 0 === $at && + ! $this->had_previous_chunks && ! $this->is_closing_tag && '?' === $xml[ $at + 1 ] && 'x' === $xml[ $at + 2 ] && @@ -1460,8 +1465,13 @@ private function parse_next_tag() { ! $this->is_closing_tag && '?' === $xml[ $at + 1 ] ) { + if ( $at + 4 >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + + return false; + } + if ( ! ( - $at + 4 <= $doc_length && ( 'x' === $xml[ $at + 2 ] || 'X' === $xml[ $at + 2 ] ) && ( 'm' === $xml[ $at + 3 ] || 'M' === $xml[ $at + 3 ] ) && ( 'l' === $xml[ $at + 4 ] || 'L' === $xml[ $at + 4 ] ) @@ -1525,7 +1535,6 @@ private function parse_next_attribute() { $this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n/", $this->bytes_already_parsed ); if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; } @@ -1553,7 +1562,6 @@ private function parse_next_attribute() { $this->skip_whitespace(); if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; } switch ( $this->xml[ $this->bytes_already_parsed ] ) { @@ -1831,7 +1839,15 @@ public function has_bookmark( $bookmark_name ) { return array_key_exists( $bookmark_name, $this->bookmarks ); } - public function get_unparsed_xml() { + public function get_processed_xml() { + // Flush updates + $this->get_updated_xml(); + return substr( $this->xml, 0, $this->bytes_already_parsed ); + } + + public function get_unprocessed_xml() { + // Flush updates + $this->get_updated_xml(); return substr( $this->xml, $this->bytes_already_parsed ); } @@ -2286,6 +2302,36 @@ public function get_modifiable_text() { return $decoded; } + public function set_modifiable_text( $new_value ) { + switch ( $this->parser_state ) { + case self::STATE_TEXT_NODE: + case self::STATE_COMMENT: + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $this->text_starts_at, + $this->text_length, + // @TODO This is naive, let's rethink this. + htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' ) + ); + return true; + + case self::STATE_CDATA_NODE: + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $this->text_starts_at, + $this->text_length, + // @TODO This is naive, let's rethink this. + str_replace( ']]>', ']]>', $new_value ) + ); + return true; + default: + _doing_it_wrong( + __METHOD__, + __( 'Cannot set text content on a non-text node.' ), + 'WP_VERSION' + ); + return false; + } + } + /** * Updates or creates a new attribute on the currently matched tag with the passed value. * diff --git a/src/wp-includes/html-api/test.wxr b/src/wp-includes/html-api/test.wxr deleted file mode 100644 index b3b59f007a5e4..0000000000000 --- a/src/wp-includes/html-api/test.wxr +++ /dev/null @@ -1,353 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - Long Bets - http://longbets.org - The Arena for Accountable Predictions - Tue, 25 Jan 2011 06:50:19 +0000 - http://wordpress.org/?v=3.0.4 - en - 1.0 - http://longbets.org - http://longbets.org - - uncategorized - - - - http://wordpress.org/?v=3.0.4 - - - Hello world! - http://longbets.org/?p=1 - Tue, 25 Jan 2011 06:47:53 +0000 - - - - http://longbets.org/?p=1 - - - - 1 - 2011-01-25 06:47:53 - 2011-01-25 06:47:53 - open - open - hello-world - publish - 0 - 0 - post - - 0 - - 1 - - - http://wordpress.org/ - - 2011-01-25 06:47:53 - 2011-01-25 06:47:53 - To delete a comment, just log in and view the post's comments. There you will have the option to edit or delete them.]]> - 1 - - 0 - 0 - - - - - - Second Post, by me! - http://longbets.org/?p=4 - Tue, 25 Jan 2011 06:49:20 +0000 - - - - http://longbets.org/?p=4 - - Lorizzle that's the shizzle dolor break yo neck, yall mah nizzle, pot adipiscing sizzle. Away sapien velizzle, sizzle volutpizzle, suscipizzle quizzle, gravida vizzle, fo shizzle my nizzle. For sure check out this tortor. Sizzle sure. Fusce izzle dolor dapibizzle for sure crunk. Mauris shizzlin dizzle nibh izzle turpis. Cool izzle tortor. Pellentesque mofo bling bling shizzlin dizzle. Dope hizzle mammasay mammasa mamma oo sa break yo neck, yall dictumst. I saw beyonces tizzles and my pizzle went crizzle dapibizzle. Yo mamma shizzle my nizzle crocodizzle urna, pretizzle eu, stuff rizzle, eleifend tellivizzle, nunc. For sure suscipizzle. Integizzle sempizzle own yo' sizzle fo.]]> - - 4 - 2011-01-25 06:49:20 - 2011-01-25 06:49:20 - open - open - second-post-by-me - publish - 0 - 0 - post - - 0 - - _edit_last - - - - _edit_lock - - - - _encloseme - - - - _wp_old_slug - - - - 2 - - bkeating@gmail.com - - 192.168.1.101 - 2011-01-25 06:52:17 - 2011-01-25 06:52:17 - - 1 - - 0 - 1 - - - 3 - - bkeating@gmail.com - - 192.168.1.101 - 2011-01-25 06:52:37 - 2011-01-25 06:52:37 - - 1 - - 0 - 1 - - - 4 - - bkeating@gmail.com - - 192.168.1.101 - 2011-01-25 06:52:59 - 2011-01-25 06:52:59 - - 1 - - 0 - 1 - - - - - - Third Post - http://longbets.org/?p=6 - Tue, 25 Jan 2011 06:49:59 +0000 - - - - http://longbets.org/?p=6 - - Lorizzle that's the shizzle dolor break yo neck, yall mah nizzle, pot adipiscing sizzle. Away sapien velizzle, sizzle volutpizzle, suscipizzle quizzle, gravida vizzle, fo shizzle my nizzle. For sure check out this tortor. Sizzle sure. Fusce izzle dolor dapibizzle for sure crunk. Mauris shizzlin dizzle nibh izzle turpis. Cool izzle tortor. Pellentesque mofo bling bling shizzlin dizzle. Dope hizzle mammasay mammasa mamma oo sa break yo neck, yall dictumst. I saw beyonces tizzles and my pizzle went crizzle dapibizzle. Yo mamma shizzle my nizzle crocodizzle urna, pretizzle eu, stuff rizzle, eleifend tellivizzle, nunc. For sure suscipizzle. Integizzle sempizzle own yo' sizzle fo.]]> - - 6 - 2011-01-25 06:49:59 - 2011-01-25 06:49:59 - open - open - third-post - publish - 0 - 0 - post - - 0 - - _edit_last - - - - _edit_lock - - - - _encloseme - - - - _wp_old_slug - - - - 5 - - bkeating@gmail.com - - 192.168.1.101 - 2011-01-25 06:53:23 - 2011-01-25 06:53:23 - - 1 - - 0 - 1 - - - 6 - - bkeating@gmail.com - - 192.168.1.101 - 2011-01-25 06:53:29 - 2011-01-25 06:53:29 - - 1 - - 5 - 1 - - - - - - Fourth Post - http://longbets.org/?p=8 - Tue, 25 Jan 2011 06:50:09 +0000 - - - - http://longbets.org/?p=8 - - - - 8 - 2011-01-25 06:50:09 - 2011-01-25 06:50:09 - open - open - fourth-post - publish - 0 - 0 - post - - 0 - - _edit_last - - - - _edit_lock - - - - _encloseme - - - - _wp_old_slug - - - - - - - Fifth Post - http://longbets.org/?p=10 - Tue, 25 Jan 2011 06:50:19 +0000 - - - - http://longbets.org/?p=10 - - - - 10 - 2011-01-25 06:50:19 - 2011-01-25 06:50:19 - open - open - fifth-post - publish - 0 - 0 - post - - 0 - - _edit_last - - - - _edit_lock - - - - _encloseme - - - - _wp_old_slug - - - - 7 - - bkeating@gmail.com - - 192.168.1.101 - 2011-01-25 06:53:52 - 2011-01-25 06:53:52 - - 1 - - 0 - 1 - - - 8 - - bkeating@gmail.com - - 192.168.1.101 - 2011-01-25 06:53:58 - 2011-01-25 06:53:58 - - 1 - - 0 - 1 - - - - - - \ No newline at end of file diff --git a/tests/phpunit/tests/html-api/wpXmlProcessor.php b/tests/phpunit/tests/html-api/wpXmlProcessor.php index bb14218ac167e..198302731015a 100644 --- a/tests/phpunit/tests/html-api/wpXmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpXmlProcessor.php @@ -51,6 +51,32 @@ public function test_get_breadcrumbs() { $this->assertFalse( $processor->next_tag() ); } + /** + * @ticket 61365 + * + * @return void + */ + public function test_streaming() { + $input_stream = fopen( 'php://memory', 'rw+' ); + fwrite( $input_stream, 'old test

]]>
' ); + rewind( $input_stream ); + $output_stream = fopen( 'php://memory', 'rw+' ); + $stream = WP_XML_Processor::stream_next_xml_token( + $input_stream, + $output_stream + ); + foreach ( $stream as $processor ) { + if ( $processor->get_token_type() === '#cdata-section' ) { + $processor->set_modifiable_text( 'new text' ); + } + } + rewind( $output_stream ); + $this->assertEquals( + '', + stream_get_contents( $output_stream ) + ); + } + /** * @ticket 61365 * diff --git a/tests/phpunit/tests/html-api/wpXmlTagProcessor.php b/tests/phpunit/tests/html-api/wpXmlTagProcessor.php index 8b2efedb9c7fa..c1bbeb5b4c96e 100644 --- a/tests/phpunit/tests/html-api/wpXmlTagProcessor.php +++ b/tests/phpunit/tests/html-api/wpXmlTagProcessor.php @@ -1184,6 +1184,30 @@ public static function data_incomplete_syntax_elements() { ); } + /** + * Ensures that the processor doesn't attempt to match an incomplete text node. + * + * @ticket 61365 + * + * @covers WP_XML_Tag_Processor::next_tag + * @covers WP_XML_Tag_Processor::paused_at_incomplete_token + */ + public function test_next_tag_returns_false_for_incomplete_text_nodes() { + $processor = new WP_XML_Tag_Processor( 'There is no closer!' ); + + $this->assertTrue( + $processor->next_tag(), + 'Should have found a tag but, did not.' + ); + $this->assertFalse( $processor->next_tag() ); + + $this->assertTrue( + $processor->paused_at_incomplete_token(), + "Should have indicated that the parser found an incomplete token but didn't." + ); + } + + /** * The string " -- " (double-hyphen) must not occur within comments. * From 76f335905124ea8e0153a0da4b51e2d9625a9165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 6 Jun 2024 16:48:14 +0200 Subject: [PATCH 8/8] Bugfixes, update entity decoder --- .../html-api/class-wp-xml-decoder.php | 136 +++++++----------- .../html-api/class-wp-xml-processor.php | 2 +- .../html-api/class-wp-xml-tag-processor.php | 1 + 3 files changed, 50 insertions(+), 89 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-xml-decoder.php b/src/wp-includes/html-api/class-wp-xml-decoder.php index bd5159a7397b3..45e48e115bac9 100644 --- a/src/wp-includes/html-api/class-wp-xml-decoder.php +++ b/src/wp-includes/html-api/class-wp-xml-decoder.php @@ -36,104 +36,64 @@ public static function decode( $text ) { if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) { break; } - // $next_character_reference_at += 1; - /* - * Capture all bytes that could form a character reference. - * - * This only supports: - * - * * The five mandated character references, that is & < > " ' - * * Numeric character references, e.g. { or  - * - * XML grammar rule for parsing numeric references is: - * - * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' [WFC: Legal Character] - * - * See https://www.w3.org/TR/xml/#NT-CharRef - */ - $token_length = strspn( - $text, - 'ampltgquos#xX0123456789bcdefABCDEF;', - $next_character_reference_at + 1, - /* - * Limit the length of the token to avoid scanning the entire document in case - * a semicolon is missing. - * - * The maximum supported code point is 10FFFF, which is 9 characters long when - * represented as either decimal or hexadecimal numeric character reference entity. - * Technically, you can also add zeros to the front of the entity, which makes the - * string longer, for example FFFF; - * - * We limit this scan to 30 characters, which allows twenty zeros at the front. - */ - 30 - ); + if ( '#' === $text[ $next_character_reference_at + 1 ] ) { + $is_hex = 'x' === $text[ $next_character_reference_at + 2 ] || 'X' === $text[ … ]; + $zeros_start_at = $next_character_reference_at + 3 + ( $is_hex ? 1 : 0 ); + $zeros_length = strspn( $text, '0', $zeros_start_at ); + $digits_start_at = $zeros_start_at + $zeros_length; + $digit_chars = $is_hex ? '0123456789abcdefABCDEF' : '0123456789'; + $digits_length = strspn( $text, $digit_chars, $digits_start_at ); + $semicolon_at = $digits_start_at + $digits_length; + + // Must be followed by a semicolon. + if ( ';' !== $text[ $semicolon_at ] ) { + return false; + } - if ( false === $token_length ) { - return null; - } + // Null bytes cannot be encoded in XML. + if ( 0 === $digits_length ) { + return false; + } - if ( ';' !== $text[ $next_character_reference_at + 1 + $token_length - 1 ] ) { /* - * In XML, all character references must end with a semicolon. + * Must encode a valid Unicode code point. + * (Avoid parsing more than is necessary). */ - return null; + $max_digits = $is_hex ? 6 : 7; + if ( $digits_length > $max_digits ) { + return false; + } + + $base = $is_hex ? 16 : 10; + $code_point = intval( substr( $text, $digits_start_at, $digits_length ), $base ); + if ( if_allowable_code_point( $code_point ) ) { + $decoded .= WP_HTML_Decoder::code_point_to_utf8_bytes( $code_point ); + $at = $semicolon_at + 1; + continue; + } + + return false; } - $token = strtolower( substr( $text, $next_character_reference_at + 1, $token_length - 1 ) ); + // Must be a named character reference. + $name_starts_at = $next_character_reference_at + 1; - if ( 'amp' === $token ) { - $character_reference = '&'; - } elseif ( 'lt' === $token ) { - $character_reference = '<'; - } elseif ( 'gt' === $token ) { - $character_reference = '>'; - } elseif ( 'quot' === $token ) { - $character_reference = '"'; - } elseif ( 'apos' === $token ) { - $character_reference = "'"; - } else { - $code_point = self::parse_code_point( $text, $next_character_reference_at ); - if ( null === $code_point ) { - /* - * > The following are forbidden, and constitute fatal errors: - * > * the appearance of a reference to an unparsed entity, except in the EntityValue in an entity declaration. - * - * See https://www.w3.org/TR/xml/#forbidden - */ - return null; - } - $character_reference = WP_HTML_Decoder::code_point_to_utf8_bytes( $code_point ); - if ( - '�' === $character_reference && - 0xFFFD !== $code_point - ) { - /* - * Stop processing if we got an invalid character AND the reference does not - * specifically refer code point FFFD (�). - * - * > It is a fatal error when an XML processor encounters an entity with an - * > encoding that it is unable to process. It is a fatal error if an XML entity - * > is determined (via default, encoding declaration, or higher-level protocol) - * > to be in a certain encoding but contains byte sequences that are not legal - * > in that encoding. Specifically, it is a fatal error if an entity encoded in - * > UTF-8 contains any ill-formed code unit sequences, as defined in section - * > 3.9 of Unicode [Unicode]. Unless an encoding is determined by a higher-level - * > protocol, it is also a fatal error if an XML entity contains no encoding - * > declaration and its content is not legal UTF-8 or UTF-16. - * - * See https://www.w3.org/TR/xml/#charencoding - */ - return null; + $standard_entities = array( + 'amp;' => '&', + 'apos;' => "'", + 'gt;' => '>', + 'lt;' => '<', + 'quot;' => '"', + ); + + foreach ( $standard_entities as $name => $replacement ) { + if ( substr_compare( $text, $name, $name_starts_at, strlen( $name ) ) ) { + $decoded .= $replacement; + $at = $name_starts_at + strlen( $name ); + break; } } - - $at = $next_character_reference_at; - $decoded .= substr( $text, $was_at, $at - $was_at ); - $decoded .= $character_reference; - $at += $token_length + 1; - $was_at = $at; } if ( 0 === $was_at ) { diff --git a/src/wp-includes/html-api/class-wp-xml-processor.php b/src/wp-includes/html-api/class-wp-xml-processor.php index 1906ce3df7cb2..a18e35fd77008 100644 --- a/src/wp-includes/html-api/class-wp-xml-processor.php +++ b/src/wp-includes/html-api/class-wp-xml-processor.php @@ -50,7 +50,7 @@ class WP_XML_Processor extends WP_XML_Tag_Processor { */ public $stack_of_open_elements = array(); - public static function stream_next_xml_token( $input_stream, $output_stream, $buffer_size = 8092 ) { + public static function stream_tokens( $input_stream, $output_stream, $buffer_size = 8092 ) { $streamed_data = fread( $input_stream, $buffer_size ); $breadcrumbs = array(); diff --git a/src/wp-includes/html-api/class-wp-xml-tag-processor.php b/src/wp-includes/html-api/class-wp-xml-tag-processor.php index d3982758282b7..db06e54b74323 100644 --- a/src/wp-includes/html-api/class-wp-xml-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-xml-tag-processor.php @@ -2314,6 +2314,7 @@ public function get_modifiable_text() { */ $this->last_error = self::ERROR_SYNTAX; + var_dump( $text ); _doing_it_wrong( __METHOD__, __( 'Invalid text content encountered.' ),