Skip to content

Commit

Permalink
Merge pull request #354 from j0k3r/wip/jtojnar/phpstan-fixes
Browse files Browse the repository at this point in the history
PHPStan fixes
  • Loading branch information
j0k3r authored Oct 8, 2024
2 parents ecfcfb1 + 8bb8da1 commit aade65f
Show file tree
Hide file tree
Showing 19 changed files with 628 additions and 100 deletions.
10 changes: 9 additions & 1 deletion .php-cs-fixer.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,17 @@
'ordered_imports' => true,
'php_unit_strict' => true,
'phpdoc_order' => true,
'phpdoc_to_param_type' => ['union_types' => false],
// This breaks `Monolog\Formatter\GrabyFormatter::convertToString()`.
// 'phpdoc_to_param_type' => ['union_types' => false],
'phpdoc_to_return_type' => ['union_types' => false],
'phpdoc_to_property_type' => ['union_types' => false],
'no_superfluous_phpdoc_tags' => [
// Copied from Symfony
'allow_hidden_params' => true,
'remove_inheritdoc' => true,
// We want this for PHPStan and we cannot use `mixed` type hint because it requires PHP 8.
'allow_mixed' => true,
],
// 'psr4' => true,
'strict_comparison' => true,
'strict_param' => true,
Expand Down
39 changes: 39 additions & 0 deletions maintenance/Rector/MockGrabyResponseRector.php
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,45 @@ private function createNewResponseExpression(ResponseInterface $response, string
/**
* Runs Graby for given URL and returns HTTP responses by the client.
*
* @param array{
* debug?: bool,
* log_level?: 'info'|'debug',
* rewrite_relative_urls?: bool,
* singlepage?: bool,
* multipage?: bool,
* error_message?: string,
* error_message_title?: string,
* allowed_urls?: string[],
* blocked_urls?: string[],
* xss_filter?: bool,
* content_type_exc?: array<string, array{name: string, action: 'link'|'exclude'}>,
* content_links?: 'preserve'|'footnotes'|'remove',
* http_client?: array{
* ua_browser?: string,
* default_referer?: string,
* rewrite_url?: array<array<string, string>>,
* header_only_types?: array<string>,
* header_only_clues?: array<string>,
* user_agents?: array<string, string>,
* ajax_triggers?: array<string>,
* max_redirect?: int,
* },
* extractor?: array{
* default_parser?: string,
* fingerprints?: array<string, string>,
* config_builder?: array{
* site_config?: string[],
* hostname_regex?: string,
* },
* readability?: array{
* pre_filters?: array<string, string>,
* post_filters?: array<string, string>,
* },
* src_lazy_load_attributes?: string[],
* json_ld_ignore_types?: string[],
* },
* } $config
*
* @return ResponseInterface[]
*/
private function fetchResponses(string $url, array $config): array
Expand Down
12 changes: 0 additions & 12 deletions phpstan.neon
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,5 @@ parameters:
-
message: '#\$innerHTML#'
path: %currentWorkingDirectory%
# other stuff I might have fucked up with DOM* classes
-
message: '#DOMNode::\$tagName#'
path: %currentWorkingDirectory%/src/
-
message: '#DOMNode::getElementsByTagName#'
path: %currentWorkingDirectory%/src/
-
message: '#expects DOMElement, DOMNode given#'
path: %currentWorkingDirectory%/src/Graby.php
-
identifier: missingType.iterableValue

inferPrivatePropertyTypeFromConstructor: true
27 changes: 14 additions & 13 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ class ContentExtractor
private ?string $language = null;
/** @var string[] */
private array $authors = [];
/** @var \DOMElement|\DOMNode|null */
private $body = null;
private ?\DOMElement $body = null;
private ?string $image = null;
private bool $nativeAd = false;
private ?string $date = null;
Expand All @@ -42,7 +41,10 @@ class ContentExtractor
* @param array{
* default_parser?: string,
* fingerprints?: array<string, string>,
* config_builder?: array,
* config_builder?: array{
* site_config?: string[],
* hostname_regex?: string,
* },
* readability?: array{
* pre_filters?: array<string, string>,
* post_filters?: array<string, string>,
Expand Down Expand Up @@ -497,7 +499,7 @@ public function process(string $html, UriInterface $url, ?SiteConfig $siteConfig
$this->logger->info('Detecting body');
$this->body = $this->readability->getContent();

if (1 === $this->body->childNodes->length && \XML_ELEMENT_NODE === $this->body->firstChild->nodeType) {
if (1 === $this->body->childNodes->length && $this->body->firstChild instanceof \DOMElement) {
$this->body = $this->body->firstChild;
}

Expand All @@ -514,11 +516,11 @@ public function process(string $html, UriInterface $url, ?SiteConfig $siteConfig
if (isset($this->title) && '' !== $this->title && null !== $this->body->firstChild) {
$firstChild = $this->body->firstChild;

while (null !== $firstChild->nextSibling && $firstChild->nodeType && (\XML_ELEMENT_NODE !== $firstChild->nodeType)) {
while (null !== $firstChild->nextSibling && !$firstChild instanceof \DOMElement) {
$firstChild = $firstChild->nextSibling;
}

if (\XML_ELEMENT_NODE === $firstChild->nodeType
if ($firstChild instanceof \DOMElement
&& \in_array(strtolower($firstChild->tagName), ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], true)
&& (strtolower(trim($firstChild->textContent)) === strtolower(trim($this->title)))) {
$this->body->removeChild($firstChild);
Expand Down Expand Up @@ -645,10 +647,7 @@ public function processStringReplacements(string $html, UriInterface $url, ?Site
return $html;
}

/**
* @return \DOMElement|\DOMNode|null
*/
public function getContent()
public function getContent(): ?\DOMElement
{
return $this->body;
}
Expand Down Expand Up @@ -1002,9 +1001,9 @@ private function extractBody(bool $detectBody, string $xpathExpression, ?\DOMNod
$this->logger->info($type . ': found "' . $elems->length . '" with ' . $xpathExpression);

if (1 === $elems->length) {
// body can't be an attribute
if ($elems->item(0) instanceof \DOMAttr) {
$this->logger->info('Body can not be an attribute');
// body can't be anything other than element
if (!$elems->item(0) instanceof \DOMElement) {
$this->logger->info('Body must be an element');

return true;
}
Expand Down Expand Up @@ -1319,6 +1318,8 @@ private function extractOpenGraph(\DOMXPath $xpath): void
/**
* Clean extract of JSON-LD authors.
*
* @param array<mixed> $authors
*
* @return string[]
*/
private function extractAuthorsFromJsonLdArray(array $authors): array
Expand Down
32 changes: 31 additions & 1 deletion src/Extractor/ContentExtractorConfig.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,23 @@ class ContentExtractorConfig
private string $default_parser;
/** @var array<string, string> */
private array $fingerprints;

/**
* @var array{
* site_config?: string[],
* hostname_regex?: string,
* }
*/
private array $config_builder;

/**
* @var array{
* pre_filters: array<string, string>,
* post_filters: array<string, string>,
* }
*/
private array $readability;

/** @var array<string> */
private array $src_lazy_load_attributes;
/** @var array<string> */
Expand All @@ -31,7 +46,10 @@ class ContentExtractorConfig
* @param array{
* default_parser?: string,
* fingerprints?: array<string, string>,
* config_builder?: array,
* config_builder?: array{
* site_config?: string[],
* hostname_regex?: string,
* },
* readability?: array{
* pre_filters?: array<string, string>,
* post_filters?: array<string, string>,
Expand Down Expand Up @@ -129,11 +147,23 @@ public function getFingerprints(): array
return $this->fingerprints;
}

/**
* @return array{
* site_config?: string[],
* hostname_regex?: string,
* }
*/
public function getConfigBuilder(): array
{
return $this->config_builder;
}

/**
* @return array{
* pre_filters: array<string, string>,
* post_filters: array<string, string>,
* }
*/
public function getReadability(): array
{
return $this->readability;
Expand Down
10 changes: 10 additions & 0 deletions src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ class HttpClient

/**
* @param ClientInterface $client Http client
* @param array{
* ua_browser?: string,
* default_referer?: string,
* rewrite_url?: array<array<string, string>>,
* header_only_types?: array<string>,
* header_only_clues?: array<string>,
* user_agents?: array<string, string>,
* ajax_triggers?: array<string>,
* max_redirect?: int,
* } $config
*/
public function __construct(ClientInterface $client, array $config = [], ?LoggerInterface $logger = null, ?ContentExtractor $extractor = null)
{
Expand Down
16 changes: 16 additions & 0 deletions src/Extractor/HttpClientConfig.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,24 @@ class HttpClientConfig
private array $header_only_types;
/** @var array<string> */
private array $header_only_clues;
/** @var array<string, string> Mapping from hostnames to user agent strings */
private array $user_agents;
/** @var array<string> */
private array $ajax_triggers;
private int $max_redirect;

/**
* @param array{
* ua_browser?: string,
* default_referer?: string,
* rewrite_url?: array<array<string, string>>,
* header_only_types?: array<string>,
* header_only_clues?: array<string>,
* user_agents?: array<string, string>,
* ajax_triggers?: array<string>,
* max_redirect?: int,
* } $config
*/
public function __construct(array $config)
{
$resolver = new OptionsResolver();
Expand Down Expand Up @@ -134,6 +147,9 @@ public function getHeaderOnlyClues(): array
return $this->header_only_clues;
}

/**
* @return array<string, string> Mapping from hostnames to user agent strings
*/
public function getUserAgents(): array
{
return $this->user_agents;
Expand Down
75 changes: 69 additions & 6 deletions src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,46 @@ class Graby

private ?string $prefetchedContent = null;

/**
* @param array{
* debug?: bool,
* log_level?: 'info'|'debug',
* rewrite_relative_urls?: bool,
* singlepage?: bool,
* multipage?: bool,
* error_message?: string,
* error_message_title?: string,
* allowed_urls?: string[],
* blocked_urls?: string[],
* xss_filter?: bool,
* content_type_exc?: array<string, array{name: string, action: 'link'|'exclude'}>,
* content_links?: 'preserve'|'footnotes'|'remove',
* http_client?: array{
* ua_browser?: string,
* default_referer?: string,
* rewrite_url?: array<array<string, string>>,
* header_only_types?: array<string>,
* header_only_clues?: array<string>,
* user_agents?: array<string, string>,
* ajax_triggers?: array<string>,
* max_redirect?: int,
* },
* extractor?: array{
* default_parser?: string,
* fingerprints?: array<string, string>,
* config_builder?: array{
* site_config?: string[],
* hostname_regex?: string,
* },
* readability?: array{
* pre_filters?: array<string, string>,
* post_filters?: array<string, string>,
* },
* src_lazy_load_attributes?: string[],
* json_ld_ignore_types?: string[],
* },
* } $config
*/
public function __construct(array $config = [], ?ClientInterface $client = null, ?ConfigBuilder $configBuilder = null)
{
$this->config = new GrabyConfig($config);
Expand Down Expand Up @@ -149,7 +189,7 @@ public function toggleImgNoReferrer(bool $toggle = false): void
/**
* Cleanup HTML from a DOMElement or a string.
*
* @param string|\DOMElement|\DOMNode $contentBlock
* @param string|\DOMElement $contentBlock
*/
public function cleanupHtml($contentBlock, UriInterface $url): string
{
Expand Down Expand Up @@ -186,13 +226,13 @@ public function cleanupHtml($contentBlock, UriInterface $url): string

// remove empty text nodes
foreach ($contentBlock->childNodes as $n) {
if (\XML_TEXT_NODE === $n->nodeType && '' === trim($n->textContent)) {
if ($n instanceof \DOMText && '' === trim($n->textContent)) {
$contentBlock->removeChild($n);
}
}

// remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
while (1 === $contentBlock->childNodes->length && \XML_ELEMENT_NODE === $contentBlock->firstChild->nodeType) {
while (1 === $contentBlock->childNodes->length && $contentBlock->firstChild instanceof \DOMElement) {
// only follow these tag names
if (!\in_array(strtolower($contentBlock->tagName), ['div', 'article', 'section', 'header', 'footer'], true)) {
break;
Expand Down Expand Up @@ -508,8 +548,19 @@ private function isUrlAllowed(string $url): bool
/**
* Based on content-type http header, decide what to do.
*
* @return array With keys: 'mime', 'type', 'subtype', 'action', 'name'
* e.g. array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image')
* @return array{
* mime: '',
* } | array{
* mime: string,
* type: string,
* subtype: string,
* } | array{
* mime: string,
* type: string,
* subtype: string,
* action: 'link'|'exclude',
* name: string,
* } E.g. `['mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image']`
*/
private function getMimeActionInfo(ResponseInterface $response): array
{
Expand Down Expand Up @@ -546,7 +597,19 @@ private function getMimeActionInfo(ResponseInterface $response): array
* Handle action related to mime type detection.
* These action can be exclude or link to handle custom content (like image, video, pdf, etc ..).
*
* @param array $mimeInfo From getMimeActionInfo() function
* @param array{
* mime: '',
* } | array{
* mime: string,
* type: string,
* subtype: string,
* } | array{
* mime: string,
* type: string,
* subtype: string,
* action: 'link'|'exclude',
* name: string,
* } $mimeInfo From getMimeActionInfo() function
*/
private function handleMimeAction(array $mimeInfo, EffectiveResponse $response): ?Content
{
Expand Down
Loading

0 comments on commit aade65f

Please sign in to comment.