Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTML API: Refactor make_clickable() #7450

Draft
wants to merge 2 commits into
base: trunk
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 144 additions & 63 deletions src/wp-includes/formatting.php
Original file line number Diff line number Diff line change
Expand Up @@ -3052,87 +3052,168 @@
/**
* Converts plaintext URI to HTML links.
*
* Converts URI, www and ftp, and email addresses. Finishes by fixing links
* within links.
* Converts URI, www and ftp, and email addresses.
*
* @since 0.71
* @since 6.7.0 Internal logic replaced with the HTML API.
*
* @param string $text Content to convert URIs.
* @return string Content with converted URIs.
*/
function make_clickable( $text ) {
$r = '';
$textarr = preg_split( '/(<[^<>]+>)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE ); // Split out HTML tags.
$nested_code_pre = 0; // Keep track of how many levels link is nested inside <pre> or <code>.
foreach ( $textarr as $piece ) {

if ( preg_match( '|^<code[\s>]|i', $piece )
|| preg_match( '|^<pre[\s>]|i', $piece )
|| preg_match( '|^<script[\s>]|i', $piece )
|| preg_match( '|^<style[\s>]|i', $piece )
) {
++$nested_code_pre;
} elseif ( $nested_code_pre
&& ( '</code>' === strtolower( $piece )
|| '</pre>' === strtolower( $piece )
|| '</script>' === strtolower( $piece )
|| '</style>' === strtolower( $piece )
)
) {
--$nested_code_pre;
}
$replacements = array();
$processor = WP_HTML_Processor::create_fragment( $text );

if ( $nested_code_pre
|| empty( $piece )
|| ( '<' === $piece[0] && ! preg_match( '|^<\s*[\w]{1,20}+://|', $piece ) )
) {
$r .= $piece;
continue;
}
/**
* Whether the currently-matched tag is inside an open `A` element.
*
* Since `A` elements cannot nest, there's no need to track a stack
* of elements and determine if `A` is inside them, or track nesting
* levels. The HTML Processor ensures proper parsing.
*/
$in_link = false;

$linkify = static function ( $ret ) use ( $replacements ) {
$at = 0;
$end = strlen( $ret );
$was_at = $at;

while ( $at < $end ) {
$separator_at = $at + strcspn( $ret, ':@.', $at );

// Long strings might contain expensive edge cases...
if ( 10000 < strlen( $piece ) ) {
// ...break it up.
foreach ( _split_str_by_whitespace( $piece, 2100 ) as $chunk ) { // 2100: Extra room for scheme and leading and trailing parentheses.
if ( 2101 < strlen( $chunk ) ) {
$r .= $chunk; // Too big, no whitespace: bail.
} else {
$r .= make_clickable( $chunk );
}
}
} else {
$ret = " $piece "; // Pad with whitespace to simplify the regexes.

$url_clickable = '~
([\\s(<.,;:!?]) # 1: Leading whitespace, or punctuation.
( # 2: URL.
[\\w]{1,20}+:// # Scheme and hier-part prefix.
(?=\S{1,2000}\s) # Limit to URLs less than about 2000 characters long.
[\\w\\x80-\\xff#%\\~/@\\[\\]*(+=&$-]*+ # Non-punctuation URL character.
(?: # Unroll the Loop: Only allow punctuation URL character if followed by a non-punctuation URL character.
[\'.,;:!?)] # Punctuation URL character.
[\\w\\x80-\\xff#%\\~/@\\[\\]*(+=&$-]++ # Non-punctuation URL character.
)*
)
(\)?) # 3: Trailing closing parenthesis (for parenthesis balancing post processing).
~xS';
/*
* The regex is a non-anchored pattern and does not have a single fixed starting character.
* Tell PCRE to spend more time optimizing since, when used on a page load, it will probably be used several times.
* Detect proper URLs.
*
* @see https://url.spec.whatwg.org/#scheme-start-state
*/
if ( 0 === substr_compare( $ret, '://', $separator_at, 3 ) ) {
// Find the schema.
$start_of_schema = $was_at;
$schema_length = 0;

while ( ( $start_of_schema + $schema_length ) < $separator_at ) {
$schema_length = strspn(
$ret,
'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.',
$start_of_schema,
$separator_at - $start_of_schema
);

if ( 0 === $schema_length ) {
++$start_of_schema;
continue;
}

$unwanted_prefix = strspn( $ret, '0123456789+-.', $start_of_schema );
$start_of_schema += $unwanted_prefix;
$schema_length -= $unwanted_prefix;

if ( ( $start_of_schema + $schema_length ) < $separator_at ) {
$start_of_schema += $schema_length;
$schema_length = 0;
}
}

$schema = strtolower( substr( $ret, $start_of_schema, $schema_length ) );
$is_special = in_array( $schema, array( 'ftp', 'file', 'http', 'https', 'ws', 'wss' ), true );

// Find the host, ignore if authority is included.
$host_at = $separator_at + 2;
$host_length = strcspn( $ret, '/?#\\', $host_at, $end - $host_at );

if ( 0 === $host_length || strcspn( $ret, '@', $host_at, $host_length ) !== $host_length ) {
// @todo: Might need to parse this fully and then reject to avoid matching next on its fragment.
$at = $separator_at + 1;
continue;
}

$ret = preg_replace_callback( $url_clickable, '_make_url_clickable_cb', $ret );
if ( strcspn( $ret, "\x00\t\f\n #/:<>?@[\\]^|", $host_at, $host_length ) !== $host_length ) {
// @todo: Might need to parse this fully and then reject to avoid matching next on its fragment.
$at = $separator_at + 1;
continue;
}

$ret = preg_replace_callback( '#([\s>])((www|ftp)\.[\w\\x80-\\xff\#$%&~/.\-;:=,?@\[\]+]+)#is', '_make_web_ftp_clickable_cb', $ret );
$ret = preg_replace_callback( '#([\s>])([.0-9a-z_+-]+)@(([0-9a-z-]+\.)+[0-9a-z]{2,})#i', '_make_email_clickable_cb', $ret );
// @todo Path parsing.
$path_at = $host_at + $host_length;
$end_match = null;
if ( 0 === preg_match(
'~(?>![[a-zA-Z0-9!$&\'()*+,-./:;=?@_~\x{00A0}-\x{10FFFD}]).~u',

Check failure on line 3140 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Multi-line function call not indented correctly; expected 20 spaces but found 24
$ret,

Check failure on line 3141 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Multi-line function call not indented correctly; expected 20 spaces but found 24
$end_match,

Check failure on line 3142 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Multi-line function call not indented correctly; expected 20 spaces but found 24
PREG_OFFSET_CAPTURE,

Check failure on line 3143 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Multi-line function call not indented correctly; expected 20 spaces but found 24
$host_at

Check failure on line 3144 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Multi-line function call not indented correctly; expected 20 spaces but found 24
) ) {

Check failure on line 3145 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Multi-line function call not indented correctly; expected 16 spaces but found 20
return $replacements;
}
}

$ret = substr( $ret, 1, -1 ); // Remove our whitespace padding.
$r .= $ret;
$was_at = $separator_at;
$at = $separator_at + 1;
}

$url_clickable = '~
([\s(<.,;:!?]) # 1: Leading whitespace, or punctuation.
( # 2: URL.
\w{1,20}+:// # Scheme and hier-part prefix.
(?=\S{1,2000}\s) # Limit to URLs less than about 2000 characters long.
[\w\x80-\xff#%\~/@\[\]*(+=&$-]*+ # Non-punctuation URL character.
(?: # Unroll the Loop: Only allow punctuation URL character if followed by a non-punctuation URL character.
[\'.,;:!?)] # Punctuation URL character.
[\w\x80-\xff#%\~/@\[\]*(+=&$-]++ # Non-punctuation URL character.
)*
)
(\)?) # 3: Trailing closing parenthesis (for parenthesis balancing post processing).
~xS';

/*
* The regex is a non-anchored pattern and does not have a single fixed starting character.
* Tell PCRE to spend more time optimizing since, when used on a page load, it will probably be used several times.
*/

$ret = preg_replace_callback($url_clickable, '_make_url_clickable_cb', $ret);

Check failure on line 3173 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Expected 1 spaces after opening parenthesis; 0 found

Check failure on line 3173 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Expected 1 spaces before closing parenthesis; 0 found
$ret = preg_replace_callback('#([\s>])((www|ftp)\.[\w\x80-\xff\#$%&~/.\-;:=,?@\[\]+]+)#is', '_make_web_ftp_clickable_cb', $ret);

Check failure on line 3174 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Expected 1 spaces after opening parenthesis; 0 found

Check failure on line 3174 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Expected 1 spaces before closing parenthesis; 0 found
$ret = preg_replace_callback( '#([\s>])([.0-9a-z_+-]+)@(([0-9a-z-]+\.)+[0-9a-z]{2,})#i', '_make_email_clickable_cb', $ret );

// $ret = substr( $ret, 1, -1 ); // Remove our whitespace padding.

return $ret;
};

while ( $processor->next_token() ) {
switch ( $processor->get_token_name() ) {
// Skip link generation when inside existing `A` elements.
case 'A':
$in_link = ! $processor->is_tag_closer();
break;

// Skip link generation within these tags.
case 'CODE':
case 'PRE':
$depth = $processor->get_current_depth();
while ( $processor->get_current_depth() >= $depth && $processor->next_token() ) {
continue;
}
break;

case '#text':
if ( $in_link ) {
break;
}

// Generate new links.
$chunk = $processor->get_modifiable_text();
$linked = $linkify( $chunk );
if ( $chunk !== $linked ) {
echo "\e[90mReplaced:\e[m\n";
echo implode( '', array_map( fn ( $line ) => "\e[0;31m- \e[3;34m{$line}\e[m\n", explode( "\n", $chunk ) ) );
echo implode( '', array_map( fn ( $line ) => "\e[0;32m+ \e[3;34m{$line}\e[m\n", explode( "\n", $linked ) ) );
}
break;
}
}

// Cleanup of accidental links within links.
return preg_replace( '#(<a([ \r\n\t]+[^>]+?>|>))<a [^>]+?>([^>]+?)</a></a>#i', '$1$3</a>', $r );
return $text;
}

/**
Expand Down
Loading