Skip to content

Commit

Permalink
Block Parser: Explore a streaming lazy interface
Browse files Browse the repository at this point in the history
For a 3 MB document that previously took 5 seconds and 14 GB of memory to parse,
this version of the parser completes in 41 ms using 40 MB.
  • Loading branch information
dmsnell committed Nov 26, 2023
1 parent 5ff4794 commit 20e8c0b
Showing 1 changed file with 323 additions and 0 deletions.
323 changes: 323 additions & 0 deletions src/wp-includes/class-wp-block-parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,329 @@
* @package WordPress
*/

/**
 * Describes a region of a string by where it starts and how long it is,
 * without holding a copy of the text itself.
 */
class WP_Span {
	/**
	 * Offset at which the region starts.
	 *
	 * @var int
	 */
	public $at = null;

	/**
	 * Length of the region.
	 *
	 * @var int
	 */
	public $length = null;

	/**
	 * Constructor.
	 *
	 * @param int $at     Offset at which the region starts.
	 * @param int $length Length of the region.
	 */
	public function __construct( $at, $length ) {
		$this->length = $length;
		$this->at     = $at;
	}
}

/**
 * A single node of a singly-linked list: a payload plus a pointer
 * to the node which follows it.
 */
class WP_Linked_List_Item {
	/**
	 * Payload carried by this node.
	 *
	 * @var mixed
	 */
	public $data;

	/**
	 * Node following this one, or null when this is the final node.
	 *
	 * @var WP_Linked_List_Item|null
	 */
	public $next = null;

	/**
	 * Constructor.
	 *
	 * @param mixed $data Payload to store in the node.
	 */
	public function __construct( $data ) {
		$this->data = $data;
	}
}

/**
 * Append-only singly-linked list which can be walked lazily.
 */
class WP_Linked_List {
	/**
	 * First node in the list, or null while the list is empty.
	 *
	 * @var WP_Linked_List_Item|null
	 */
	public $head;

	/**
	 * Final node in the list; kept so that appending does
	 * not require walking the list from the head.
	 *
	 * @var WP_Linked_List_Item|null
	 */
	public $last;

	/**
	 * Number of values appended so far.
	 *
	 * @var int
	 */
	public $length = 0;

	/**
	 * Adds a value to the end of the list.
	 *
	 * @param mixed $data Value to store.
	 */
	public function append( $data ) {
		++$this->length;

		$node = new WP_Linked_List_Item( $data );

		if ( null === $this->head ) {
			// First value: the node is both the front and the back of the list.
			$this->head = $node;
		} else {
			$this->last->next = $node;
		}

		$this->last = $node;
	}

	/**
	 * Lazily yields each stored value, from the front of the list to the back.
	 *
	 * @return Generator
	 */
	public function items() {
		for ( $node = $this->head; $node; $node = $node->next ) {
			yield $node->data;
		}
	}
}

/**
 * Lazy view of a single parsed block.
 *
 * Instead of holding a copy of the block's name and attributes, this stores
 * offsets into data owned by the parent WP_Parsed_Blocks object and only
 * materializes values when they are read through array access, e.g.
 * `$block['blockName']`.
 *
 * The ArrayAccess methods are declared without native parameter or return
 * types so that the class loads on PHP 7.0; `#[ReturnTypeWillChange]`
 * (which older PHP versions read as a comment) silences the PHP 8.1+
 * deprecation for the missing return types.
 */
class WP_Parsed_Block implements ArrayAccess {
	/**
	 * Offset into the parent's `seen_block_types` string where this block's
	 * name starts, or null when the block has no name (e.g. the root block).
	 *
	 * @var int|null
	 */
	public $name_at;

	/**
	 * Location of the block's raw JSON attributes inside the parent's HTML.
	 *
	 * @var WP_Span|null
	 */
	public $attrs;

	/**
	 * List of inner content.
	 *
	 * @var WP_Linked_List|null
	 */
	public $inner_content;

	/**
	 * Parse state which owns the HTML and the block-name lookup string.
	 *
	 * @var WP_Parsed_Blocks
	 */
	public $post;

	/**
	 * Constructor.
	 *
	 * @param WP_Parsed_Blocks $post Parse state this block belongs to.
	 */
	public function __construct( $post ) {
		$this->post = $post;
	}

	/**
	 * Reads one of the block's lazily-computed fields.
	 *
	 * @param mixed $offset One of 'blockName', 'attrs', or 'inner_content'.
	 * @return mixed The block name as a string (or null when the block has no
	 *               name), the decoded attributes (or null when there are none),
	 *               an iterable of inner content, or null for an unknown offset.
	 */
	#[ReturnTypeWillChange]
	public function offsetGet( $offset ) {
		switch ( $offset ) {
			case 'blockName':
				if ( null === $this->name_at ) {
					return null;
				}

				// Names in the lookup string are each terminated by a NUL byte.
				$name_end = strpos( $this->post->seen_block_types, "\x00", $this->name_at );
				return substr( $this->post->seen_block_types, $this->name_at, $name_end - $this->name_at );

			case 'attrs':
				if ( null === $this->attrs ) {
					return null;
				}

				// NOTE(review): decodes JSON objects as stdClass, not associative arrays — confirm this is what consumers expect.
				return json_decode( substr( $this->post->html, $this->attrs->at, $this->attrs->length ) );

			case 'inner_content':
				if ( null === $this->inner_content ) {
					return array();
				}

				return $this->inner_content->items();
		}

		return null;
	}

	/**
	 * Reports whether a field is available and non-null, matching `isset()` semantics.
	 *
	 * @param mixed $offset Field name to check.
	 * @return bool Whether reading the field would produce a non-null value.
	 */
	#[ReturnTypeWillChange]
	public function offsetExists( $offset ) {
		switch ( $offset ) {
			case 'blockName':
				return null !== $this->name_at;

			case 'attrs':
				return null !== $this->attrs;

			case 'inner_content':
				// Reading inner content always produces an iterable, even if empty.
				return true;
		}

		return false;
	}

	/**
	 * Does nothing: writing through array access is not implemented.
	 *
	 * @param mixed $offset Ignored.
	 * @param mixed $value  Ignored.
	 */
	#[ReturnTypeWillChange]
	public function offsetSet( $offset, $value ) {
		// TODO: Implement offsetSet() method.
	}

	/**
	 * Does nothing: unsetting through array access is not implemented.
	 *
	 * @param mixed $offset Ignored.
	 */
	#[ReturnTypeWillChange]
	public function offsetUnset( $offset ) {
		// TODO: Implement offsetUnset() method.
	}
}

/**
 * One entry in the stack of blocks tracked while parsing. Entries are
 * linked to their neighbors in both directions.
 */
class WP_Block_Stack_Item {
	/**
	 * Block held by this entry.
	 *
	 * @var WP_Parsed_Block
	 */
	public $data = null;

	/**
	 * Entry beneath this one, or null if this is the top node.
	 *
	 * @var WP_Block_Stack_Item|null
	 */
	public $parent = null;

	/**
	 * Entry nested inside this one, or null if this is a leaf node.
	 *
	 * @var WP_Block_Stack_Item|null
	 */
	public $child = null;
}

/**
 * Incremental block parser state for a single HTML document.
 *
 * Each call to `step()` consumes the next block comment delimiter and
 * records spans and blocks as inner content of whichever block is open at
 * that moment. Everything hangs off of `$root`, a nameless block created
 * by the constructor and opened on the stack before parsing starts.
 */
class WP_Parsed_Blocks {
	/**
	 * Original HTML from which the blocks were parsed.
	 *
	 * @var string
	 */
	public $html;

	/**
	 * Tracks internal pointer into HTML.
	 *
	 * @var int
	 */
	public $at = 0;

	/**
	 * Concatenated block names, as parsed. Used for quick lookup
	 * of existing names.
	 *
	 * The string starts with a NUL byte and every name is followed by a
	 * NUL byte, so a complete name can be found by searching for
	 * "\x00{name}\x00".
	 *
	 * @var string
	 */
	public $seen_block_types = "\x00";

	/**
	 * Tracks blocks while parsing.
	 *
	 * @var WP_Block_Stack
	 */
	public $stack;

	/**
	 * Nameless container block which collects all top-level content.
	 *
	 * @var WP_Parsed_Block
	 */
	public $root;

	/**
	 * Constructor.
	 *
	 * @param string $html Document to parse.
	 */
	public function __construct( $html ) {
		$this->html                = $html;
		$this->stack               = new WP_Block_Stack();
		$this->root                = new WP_Parsed_Block( $this );
		$this->root->inner_content = new WP_Linked_List();
		$this->stack->open( $this->root );
	}

	/**
	 * Advances the parser past the next block comment delimiter.
	 *
	 * Any HTML between the current position and the delimiter is recorded as
	 * a span on the currently-open block. Openers and void blocks are added
	 * to the currently-open block; openers are additionally pushed onto the
	 * stack, and closers pop it.
	 *
	 * When no further delimiter can be found (which includes the case where
	 * the pattern match itself fails), the remaining HTML is recorded as a
	 * final span so that trailing content is not lost.
	 *
	 * A closer found while only the root block is open has nothing to close;
	 * it is preserved as plain content rather than popping the root, which
	 * would leave the stack empty and break every later step.
	 *
	 * NOTE(review): a closer is not checked against the name of the block it
	 * closes; it always closes the innermost open block.
	 *
	 * @return bool True if a delimiter was consumed, false once the document is exhausted.
	 */
	public function step() {
		$end = strlen( $this->html );
		if ( $this->at >= $end ) {
			return false;
		}

		$has_match = preg_match(
			'/<!--\s+(?P<closer>\/)?wp:(?P<namespace>[a-z][a-z0-9_-]*\/)?(?P<name>[a-z][a-z0-9_-]*)\s+(?P<attrs>{(?:(?:[^}]+|}+(?=})|(?!}\s+\/?-->).)*+)?}\s+)?(?P<void>\/)?-->/s',
			$this->html,
			$matches,
			PREG_OFFSET_CAPTURE,
			$this->at
		);

		if ( ! $has_match ) {
			// No more delimiters: whatever remains is plain HTML.
			$this->stack->add_inner_chunk( new WP_Span( $this->at, $end - $this->at ) );
			$this->at = $end;
			return false;
		}

		list( $match, $started_at ) = $matches[0];

		/*
		 * With PREG_OFFSET_CAPTURE an optional group which did not participate
		 * in the match is either reported with an offset of -1 or, when it
		 * trails the last matched group, left out of the results entirely.
		 */
		$length        = strlen( $match );
		$is_closer     = isset( $matches['closer'] ) && -1 !== $matches['closer'][1];
		$is_void       = isset( $matches['void'] ) && -1 !== $matches['void'][1];
		$has_namespace = isset( $matches['namespace'] ) && -1 !== $matches['namespace'][1];
		$has_attrs     = isset( $matches['attrs'] ) && -1 !== $matches['attrs'][1];
		$namespace     = $has_namespace ? $matches['namespace'][0] : 'core/';
		$name          = $namespace . $matches['name'][0];

		// HTML sitting between the previous delimiter and this one.
		if ( $started_at > $this->at ) {
			$this->stack->add_inner_chunk( new WP_Span( $this->at, $started_at - $this->at ) );
		}

		$this->at = $started_at + $length;

		if ( $is_closer ) {
			if ( $this->stack->depth > 1 ) {
				$this->stack->close();
			} else {
				// Only the root is open; keep the stray closer as content.
				$this->stack->add_inner_chunk( new WP_Span( $started_at, $length ) );
			}
			return true;
		}

		$block = new WP_Parsed_Block( $this );

		// Block name: reuse the stored copy if this name has been seen before.
		$name_search  = "\x00{$name}\x00";
		$seen_name_at = strpos( $this->seen_block_types, $name_search );
		if ( false === $seen_name_at ) {
			$block->name_at          = strlen( $this->seen_block_types );
			$this->seen_block_types .= "{$name}\x00";
		} else {
			// Skip over the NUL byte which leads the search string.
			$block->name_at = $seen_name_at + 1;
		}

		// Block attrs: remember where the JSON lives instead of decoding it now.
		if ( $has_attrs ) {
			$block->attrs = new WP_Span( $matches['attrs'][1], strlen( $matches['attrs'][0] ) );
		}

		$this->stack->add_inner_chunk( $block );

		if ( ! $is_void ) {
			$this->stack->open( $block );
		}
		return true;
	}
}

/**
 * Stack of the blocks which are currently open while parsing, stored
 * as a chain of linked items.
 */
class WP_Block_Stack {
	/**
	 * Top of the stack: the most-recently opened item which has not yet
	 * been closed, or null when the stack is empty.
	 *
	 * @var WP_Block_Stack_Item|null
	 */
	public $stack;

	/**
	 * Size of stack.
	 *
	 * @var int
	 */
	public $depth = 0;

	/**
	 * Pushes a block onto the stack, making it the innermost open block.
	 *
	 * @param WP_Parsed_Block $data Block to open.
	 */
	public function open( $data ) {
		$child         = new WP_Block_Stack_Item();
		$child->data   = $data;
		$child->parent = $this->stack;

		if ( null !== $this->stack ) {
			$this->stack->child = $child;
		}

		$this->stack = $child;
		$this->depth++;
	}

	/**
	 * Pops the innermost open block off of the stack.
	 *
	 * The item which becomes the new top has its `child` pointer cleared
	 * so that it is a leaf again and the popped item is no longer
	 * referenced from the stack.
	 *
	 * @return WP_Parsed_Block|null The block which was closed, or null if the stack was empty.
	 */
	public function close() {
		if ( null === $this->stack ) {
			return null;
		}

		$closed      = $this->stack;
		$this->stack = $closed->parent;
		$this->depth--;

		if ( null !== $this->stack ) {
			$this->stack->child = null;
		}

		return $closed->data;
	}

	/**
	 * Appends a chunk to the inner content of the innermost open block,
	 * creating that block's content list if it does not have one yet.
	 *
	 * The stack must not be empty when this is called.
	 *
	 * @param mixed $data Chunk to append.
	 */
	public function add_inner_chunk( $data ) {
		$block = $this->stack->data;

		if ( null === $block->inner_content ) {
			$block->inner_content = new WP_Linked_List();
		}

		$block->inner_content->append( $data );
	}
}

/**
* Class WP_Block_Parser
*
Expand Down

0 comments on commit 20e8c0b

Please sign in to comment.