From 8bb8da1c8c1dccf214e569daa7d28cbf82a8f8da Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 29 Sep 2024 05:20:21 +0200 Subject: [PATCH] Fix DOM related PHPStan issues - Remove `DOMNode` from `Graby::$body` union type, it is obtained from `Readability::getContent()`, which only returns `DOMElement`. - Remove `DOMNode` from `Graby::cleanupHtml()` union type for `$contentBlock` parameter, it is obtained from `ContentExtractor::getContent()`, which only returns `DOMElement`. - Use `instanceof` checks rather than comparing `DOMNode::$nodeType` property to make the actual class clear to PHPStan. - Discard non-element bodies in `ContentExtractor::extractBody()`, `ContentExtractor::process()` assumes the body is an `Element` as it checks if its `tagName` is `iframe`. --- phpstan.neon | 10 ---------- src/Extractor/ContentExtractor.php | 20 ++++++++------------ src/Graby.php | 6 +++--- 3 files changed, 11 insertions(+), 25 deletions(-) diff --git a/phpstan.neon b/phpstan.neon index 65e2463..aaf5290 100644 --- a/phpstan.neon +++ b/phpstan.neon @@ -20,15 +20,5 @@ parameters: - message: '#\$innerHTML#' path: %currentWorkingDirectory% - # other stuff I might have fucked up with DOM* classes - - - message: '#DOMNode::\$tagName#' - path: %currentWorkingDirectory%/src/ - - - message: '#DOMNode::getElementsByTagName#' - path: %currentWorkingDirectory%/src/ - - - message: '#expects DOMElement, DOMNode given#' - path: %currentWorkingDirectory%/src/Graby.php inferPrivatePropertyTypeFromConstructor: true diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php index 65eeac0..e8d6db2 100644 --- a/src/Extractor/ContentExtractor.php +++ b/src/Extractor/ContentExtractor.php @@ -28,8 +28,7 @@ class ContentExtractor private ?string $language = null; /** @var string[] */ private array $authors = []; - /** @var \DOMElement|\DOMNode|null */ - private $body = null; + private ?\DOMElement $body = null; private ?string $image = null; private bool $nativeAd = false; private ?string $date = null; @@ -500,7 +499,7 @@ public function process(string $html, UriInterface $url, ?SiteConfig $siteConfig $this->logger->info('Detecting body'); $this->body = $this->readability->getContent(); - if (1 === $this->body->childNodes->length && \XML_ELEMENT_NODE === $this->body->firstChild->nodeType) { + if (1 === $this->body->childNodes->length && $this->body->firstChild instanceof \DOMElement) { $this->body = $this->body->firstChild; } @@ -517,11 +516,11 @@ public function process(string $html, UriInterface $url, ?SiteConfig $siteConfig if (isset($this->title) && '' !== $this->title && null !== $this->body->firstChild) { $firstChild = $this->body->firstChild; - while (null !== $firstChild->nextSibling && $firstChild->nodeType && (\XML_ELEMENT_NODE !== $firstChild->nodeType)) { + while (null !== $firstChild->nextSibling && !$firstChild instanceof \DOMElement) { $firstChild = $firstChild->nextSibling; } - if (\XML_ELEMENT_NODE === $firstChild->nodeType + if ($firstChild instanceof \DOMElement && \in_array(strtolower($firstChild->tagName), ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], true) && (strtolower(trim($firstChild->textContent)) === strtolower(trim($this->title)))) { $this->body->removeChild($firstChild); @@ -648,10 +647,7 @@ public function processStringReplacements(string $html, UriInterface $url, ?Site return $html; } - /** - * @return \DOMElement|\DOMNode|null - */ - public function getContent() + public function getContent(): ?\DOMElement { return $this->body; } @@ -1005,9 +1001,9 @@ private function extractBody(bool $detectBody, string $xpathExpression, ?\DOMNod $this->logger->info($type . ': found "' . $elems->length . '" with ' . $xpathExpression); if (1 === $elems->length) { - // body can't be an attribute - if ($elems->item(0) instanceof \DOMAttr) { - $this->logger->info('Body can not be an attribute'); + // body can't be anything other than element + if (!$elems->item(0) instanceof \DOMElement) { + $this->logger->info('Body must be an element'); return true; } diff --git a/src/Graby.php b/src/Graby.php index 91e0bec..2e659ec 100644 --- a/src/Graby.php +++ b/src/Graby.php @@ -189,7 +189,7 @@ public function toggleImgNoReferrer(bool $toggle = false): void /** * Cleanup HTML from a DOMElement or a string. * - * @param string|\DOMElement|\DOMNode $contentBlock + * @param string|\DOMElement $contentBlock */ public function cleanupHtml($contentBlock, UriInterface $url): string { @@ -226,13 +226,13 @@ public function cleanupHtml($contentBlock, UriInterface $url): string // remove empty text nodes foreach ($contentBlock->childNodes as $n) { - if (\XML_TEXT_NODE === $n->nodeType && '' === trim($n->textContent)) { + if ($n instanceof \DOMText && '' === trim($n->textContent)) { $contentBlock->removeChild($n); } } // remove nesting:

test

=

test

- while (1 === $contentBlock->childNodes->length && \XML_ELEMENT_NODE === $contentBlock->firstChild->nodeType) { + while (1 === $contentBlock->childNodes->length && $contentBlock->firstChild instanceof \DOMElement) { // only follow these tag names if (!\in_array(strtolower($contentBlock->tagName), ['div', 'article', 'section', 'header', 'footer'], true)) { break;