Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

XML Stream Processing – Proof of concept #43

Draft
wants to merge 10 commits into
base: xml-processor
Choose a base branch
from
136 changes: 48 additions & 88 deletions src/wp-includes/html-api/class-wp-xml-decoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,104 +36,64 @@ public static function decode( $text ) {
if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
break;
}
// $next_character_reference_at += 1;

/*
* Capture all bytes that could form a character reference.
*
* This only supports:
*
* * The five mandated character references, that is &amp; &lt; &gt; &quot; &apos;
* * Numeric character references, e.g. &#123; or &#x7B;
*
* XML grammar rule for parsing numeric references is:
*
* [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' [WFC: Legal Character]
*
* See https://www.w3.org/TR/xml/#NT-CharRef
*/
$token_length = strspn(
$text,
'ampltgquos#xX0123456789bcdefABCDEF;',
$next_character_reference_at + 1,
/*
* Limit the length of the token to avoid scanning the entire document in case
* a semicolon is missing.
*
* The maximum supported code point is 10FFFF, which is 9 characters long when
* represented as either decimal or hexadecimal numeric character reference entity.
* Technically, you can also add zeros to the front of the entity, which makes the
* string longer, for example &#00000010FFFF;
*
* We limit this scan to 30 characters, which allows twenty zeros at the front.
*/
30
);
if ( '#' === $text[ $next_character_reference_at + 1 ] ) {
$is_hex = 'x' === $text[ $next_character_reference_at + 2 ] || 'X' === $text[ … ];
$zeros_start_at = $next_character_reference_at + 3 + ( $is_hex ? 1 : 0 );
$zeros_length = strspn( $text, '0', $zeros_start_at );
$digits_start_at = $zeros_start_at + $zeros_length;
$digit_chars = $is_hex ? '0123456789abcdefABCDEF' : '0123456789';
$digits_length = strspn( $text, $digit_chars, $digits_start_at );
$semicolon_at = $digits_start_at + $digits_length;

// Must be followed by a semicolon.
if ( ';' !== $text[ $semicolon_at ] ) {
return false;
}

if ( false === $token_length ) {
return null;
}
// Null bytes cannot be encoded in XML.
if ( 0 === $digits_length ) {
return false;
}

if ( ';' !== $text[ $next_character_reference_at + 1 + $token_length - 1 ] ) {
/*
* In XML, all character references must end with a semicolon.
* Must encode a valid Unicode code point.
* (Avoid parsing more than is necessary).
*/
return null;
$max_digits = $is_hex ? 6 : 7;
if ( $digits_length > $max_digits ) {
return false;
}

$base = $is_hex ? 16 : 10;
$code_point = intval( substr( $text, $digits_start_at, $digits_length ), $base );
if ( if_allowable_code_point( $code_point ) ) {
$decoded .= WP_HTML_Decoder::code_point_to_utf8_bytes( $code_point );
$at = $semicolon_at + 1;
continue;
}

return false;
}

$token = strtolower( substr( $text, $next_character_reference_at + 1, $token_length - 1 ) );
// Must be a named character reference.
$name_starts_at = $next_character_reference_at + 1;

if ( 'amp' === $token ) {
$character_reference = '&';
} elseif ( 'lt' === $token ) {
$character_reference = '<';
} elseif ( 'gt' === $token ) {
$character_reference = '>';
} elseif ( 'quot' === $token ) {
$character_reference = '"';
} elseif ( 'apos' === $token ) {
$character_reference = "'";
} else {
$code_point = self::parse_code_point( $text, $next_character_reference_at );
if ( null === $code_point ) {
/*
* > The following are forbidden, and constitute fatal errors:
* > * the appearance of a reference to an unparsed entity, except in the EntityValue in an entity declaration.
*
* See https://www.w3.org/TR/xml/#forbidden
*/
return null;
}
$character_reference = WP_HTML_Decoder::code_point_to_utf8_bytes( $code_point );
if (
'�' === $character_reference &&
0xFFFD !== $code_point
) {
/*
* Stop processing if we got an invalid character AND the reference does not
* specifically refer code point FFFD (�).
*
* > It is a fatal error when an XML processor encounters an entity with an
* > encoding that it is unable to process. It is a fatal error if an XML entity
* > is determined (via default, encoding declaration, or higher-level protocol)
* > to be in a certain encoding but contains byte sequences that are not legal
* > in that encoding. Specifically, it is a fatal error if an entity encoded in
* > UTF-8 contains any ill-formed code unit sequences, as defined in section
* > 3.9 of Unicode [Unicode]. Unless an encoding is determined by a higher-level
* > protocol, it is also a fatal error if an XML entity contains no encoding
* > declaration and its content is not legal UTF-8 or UTF-16.
*
* See https://www.w3.org/TR/xml/#charencoding
*/
return null;
$standard_entities = array(
'amp;' => '&',
'apos;' => "'",
'gt;' => '>',
'lt;' => '<',
'quot;' => '"',
);

foreach ( $standard_entities as $name => $replacement ) {
if ( substr_compare( $text, $name, $name_starts_at, strlen( $name ) ) ) {
$decoded .= $replacement;
$at = $name_starts_at + strlen( $name );
break;
}
}

$at = $next_character_reference_at;
$decoded .= substr( $text, $was_at, $at - $was_at );
$decoded .= $character_reference;
$at += $token_length + 1;
$was_at = $at;
}

if ( 0 === $was_at ) {
Expand Down
48 changes: 46 additions & 2 deletions src/wp-includes/html-api/class-wp-xml-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,59 @@ class WP_XML_Processor extends WP_XML_Tag_Processor {
*/
public $stack_of_open_elements = array();

/**
 * Streams an XML document through the processor one token at a time.
 *
 * Reads `$input_stream` in chunks of up to `$buffer_size` bytes and yields the
 * active processor after every token found. Whenever the processor pauses at
 * an incomplete token, the already-processed XML is written to
 * `$output_stream`, another chunk is read, and a new processor is created from
 * the unprocessed remainder plus the new chunk, carrying over the breadcrumbs
 * and the parser context of the previous processor.
 *
 * @since WP_VERSION
 *
 * @param resource $input_stream  Stream to read the XML document from.
 * @param resource $output_stream Stream that receives the processed XML.
 * @param int      $buffer_size   Optional. Maximum number of bytes per read. Default 8092.
 * @return Generator Yields the current WP_XML_Processor for each token. The
 *                   generator's return value is true once the input is exhausted.
 * @throws Exception When the processor reports an error, or when the input
 *                   ends while the processor is still waiting for the rest
 *                   of a token.
 */
public static function stream_tokens( $input_stream, $output_stream, $buffer_size = 8092 ) {
	$streamed_data = fread( $input_stream, $buffer_size );
	if ( false === $streamed_data ) {
		// A failed read provides no bytes; never hand `false` to the processor.
		$streamed_data = '';
	}

	$processor = new WP_XML_Processor( $streamed_data, array(), self::IN_PROLOG_CONTEXT );
	while ( true ) {
		$token_found = $processor->next_token();
		// Flush updates; the return value is intentionally unused here.
		$processor->get_updated_xml();

		if ( $processor->paused_at_incomplete_token() ) {
			fwrite( $output_stream, $processor->get_processed_xml() );

			$next_chunk = fread( $input_stream, $buffer_size );
			if ( false === $next_chunk ) {
				$next_chunk = '';
			}

			/*
			 * Without new bytes the next processor would see exactly the same
			 * input and pause at the same place again. Once the stream is
			 * exhausted this would repeat forever, so fail loudly instead.
			 */
			if ( '' === $next_chunk && feof( $input_stream ) ) {
				throw new Exception( 'Unexpected end of input: the XML document ends in the middle of a token.' );
			}

			$processor = new WP_XML_Processor(
				$processor->get_unprocessed_xml() . $next_chunk,
				$processor->get_breadcrumbs(),
				$processor->get_parser_context()
			);

			/*
			 * Make sure `<?xml` tokens in this chunk are treated as processing
			 * instructions and not as the XML declaration.
			 */
			$processor->had_previous_chunks = true;
			continue;
		}

		if ( null !== $processor->get_last_error() ) {
			throw new Exception( $processor->get_last_error() );
		}

		if ( ! $token_found ) {
			// No more tokens and nothing pending: write out the remainder and finish.
			fwrite( $output_stream, $processor->get_updated_xml() );
			return true;
		}

		yield $processor;
	}
}

/**
* Constructor.
*
* Passes the XML to the parent constructor, then stores the given breadcrumbs
* as the stack of open elements and the given value as the parser context.
*
* @since WP_VERSION
*
* @param string $xml XML to process.
* @param array $breadcrumbs Optional. Initial stack of open elements. Default empty array.
* @param mixed $parser_context Optional. Initial parser context. Default self::IN_PROLOG_CONTEXT.
* Presumably one of the class's *_CONTEXT constants; confirm against their declaration.
*/
public function __construct( $xml ) {
public function __construct( $xml, $breadcrumbs = array(), $parser_context = self::IN_PROLOG_CONTEXT ) {
parent::__construct( $xml );
$this->stack_of_open_elements = $breadcrumbs;
$this->parser_context = $parser_context;
}

/**
* Returns the parser context currently stored on this processor.
*
* stream_tokens() passes this value to the constructor of the next processor
* when it continues a document with a new chunk.
*
* @return mixed The current value of the `parser_context` property.
*/
public function get_parser_context() {
return $this->parser_context;
}

/**
Expand Down Expand Up @@ -149,7 +193,7 @@ public function next_tag( $query = null ) {
return false;
}

/**
/*
* Sets a bookmark in the XML document.
*
* Bookmarks represent specific places or tokens in the XML
Expand Down
56 changes: 53 additions & 3 deletions src/wp-includes/html-api/class-wp-xml-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,8 @@ class WP_XML_Tag_Processor {
*/
protected $seek_count = 0;

/**
* Whether the XML given to this processor continues a document whose earlier
* chunks were already processed.
*
* WP_XML_Processor::stream_tokens() sets this to true on every processor it
* creates after the first one, so that `<?xml` tokens are treated as
* processing instructions rather than as the XML declaration.
*
* @var bool
*/
public $had_previous_chunks = false;

/**
* Constructor.
*
Expand Down Expand Up @@ -1358,6 +1360,7 @@ private function parse_next_tag() {
*/
if (
0 === $at &&
! $this->had_previous_chunks &&
! $this->is_closing_tag &&
'?' === $xml[ $at + 1 ] &&
'x' === $xml[ $at + 2 ] &&
Expand Down Expand Up @@ -1483,8 +1486,13 @@ private function parse_next_tag() {
! $this->is_closing_tag &&
'?' === $xml[ $at + 1 ]
) {
if ( $at + 4 >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
}

if ( ! (
$at + 4 <= $doc_length &&
( 'x' === $xml[ $at + 2 ] || 'X' === $xml[ $at + 2 ] ) &&
( 'm' === $xml[ $at + 3 ] || 'M' === $xml[ $at + 3 ] ) &&
( 'l' === $xml[ $at + 4 ] || 'L' === $xml[ $at + 4 ] )
Expand Down Expand Up @@ -1548,7 +1556,6 @@ private function parse_next_attribute() {
$this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n/", $this->bytes_already_parsed );
if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
}

Expand Down Expand Up @@ -1576,7 +1583,6 @@ private function parse_next_attribute() {
$this->skip_whitespace();
if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
}
switch ( $this->xml[ $this->bytes_already_parsed ] ) {
Expand Down Expand Up @@ -1855,6 +1861,19 @@ public function has_bookmark( $bookmark_name ) {
return array_key_exists( $bookmark_name, $this->bookmarks );
}

/**
 * Returns the leading part of the document that has already been parsed.
 *
 * Updates are flushed first, so the returned bytes come from the updated
 * document.
 *
 * @return string XML from the start of the document up to the current parse position.
 */
public function get_processed_xml() {
	// Flush updates before reading the parse position.
	$this->get_updated_xml();

	$parsed_byte_count = $this->bytes_already_parsed;

	return substr( $this->xml, 0, $parsed_byte_count );
}

/**
 * Returns the trailing part of the document that has not been parsed yet.
 *
 * Updates are flushed first, so the returned bytes come from the updated
 * document.
 *
 * @return string XML from the current parse position to the end of the document.
 */
public function get_unprocessed_xml() {
	// Flush updates before reading the parse position.
	$this->get_updated_xml();

	$resume_offset = $this->bytes_already_parsed;

	return substr( $this->xml, $resume_offset );
}


/**
* Move the internal cursor in the Tag Processor to a given bookmark's location.
*
Expand Down Expand Up @@ -2295,6 +2314,7 @@ public function get_modifiable_text() {
*/

$this->last_error = self::ERROR_SYNTAX;
var_dump( $text );
_doing_it_wrong(
__METHOD__,
__( 'Invalid text content encountered.' ),
Expand All @@ -2305,6 +2325,36 @@ public function get_modifiable_text() {
return $decoded;
}

/**
 * Replaces the text of the currently matched text, comment, or CDATA node.
 *
 * The replacement is enqueued as a lexical update covering the span given by
 * `text_starts_at` and `text_length`. How the new value is serialized depends
 * on the node type:
 *
 *  - Text nodes: the value is escaped with htmlspecialchars() using ENT_XML1.
 *  - Comments: the value is written verbatim, because references are not
 *    decoded inside XML comments. Values that would produce a malformed
 *    comment are rejected.
 *  - CDATA sections: the value is written verbatim, and every "]]>" is split
 *    across two adjacent CDATA sections so it cannot end the section early.
 *
 * @since WP_VERSION
 *
 * @param string $new_value Plain, unescaped text to store in the node.
 * @return bool True when the update was enqueued, false when the current
 *              token is not a text, comment, or CDATA node, or when the
 *              value cannot be represented in a comment.
 */
public function set_modifiable_text( $new_value ) {
	switch ( $this->parser_state ) {
		case self::STATE_TEXT_NODE:
			$this->lexical_updates[] = new WP_HTML_Text_Replacement(
				$this->text_starts_at,
				$this->text_length,
				// @TODO This is naive, let's rethink this.
				htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' )
			);
			return true;

		case self::STATE_COMMENT:
			/*
			 * XML comments may not contain "--" and may not end with "-",
			 * and there is no way to escape either inside a comment.
			 */
			if ( false !== strpos( $new_value, '--' ) || '-' === substr( $new_value, -1 ) ) {
				_doing_it_wrong(
					__METHOD__,
					__( 'Comment text cannot contain "--" or end with "-".' ),
					'WP_VERSION'
				);
				return false;
			}

			$this->lexical_updates[] = new WP_HTML_Text_Replacement(
				$this->text_starts_at,
				$this->text_length,
				// Entity-escaping here would change the comment text, so write it as-is.
				$new_value
			);
			return true;

		case self::STATE_CDATA_NODE:
			$this->lexical_updates[] = new WP_HTML_Text_Replacement(
				$this->text_starts_at,
				$this->text_length,
				/*
				 * "&gt;" is literal text inside CDATA, so it cannot stand in
				 * for ">". Close the section after "]]" and reopen it before
				 * ">" to keep the character data unchanged.
				 */
				str_replace( ']]>', ']]]]><![CDATA[>', $new_value )
			);
			return true;

		default:
			_doing_it_wrong(
				__METHOD__,
				__( 'Cannot set text content on a non-text node.' ),
				'WP_VERSION'
			);
			return false;
	}
}

/**
* Updates or creates a new attribute on the currently matched tag with the passed value.
*
Expand Down
Loading