Skip to content

Commit

Permalink
Fix DOM related PHPStan issues
Browse files Browse the repository at this point in the history
- Remove `DOMNode` from `Graby::$body` union type, it is obtained from `Readability::getContent()`, which only returns `DOMElement`.
- Remove `DOMNode` from `Graby::cleanupHtml()` union type for `$contentBlock` parameter, it is obtained from `ContentExtractor::getContent()`, which only returns `DOMElement`.
- Use `instanceof` checks rather than comparing `DOMNode::$nodeType` property to make the actual class clear to PHPStan.
- Discard non-element bodies in `ContentExtractor::extractBody()`, `ContentExtractor::process()` assumes the body is an `Element` as it checks if its `tagName` is `iframe`.
  • Loading branch information
jtojnar committed Oct 8, 2024
1 parent 94cb3a4 commit 8bb8da1
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 25 deletions.
10 changes: 0 additions & 10 deletions phpstan.neon
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,5 @@ parameters:
-
message: '#\$innerHTML#'
path: %currentWorkingDirectory%
# other stuff I might have fucked up with DOM* classes
-
message: '#DOMNode::\$tagName#'
path: %currentWorkingDirectory%/src/
-
message: '#DOMNode::getElementsByTagName#'
path: %currentWorkingDirectory%/src/
-
message: '#expects DOMElement, DOMNode given#'
path: %currentWorkingDirectory%/src/Graby.php

inferPrivatePropertyTypeFromConstructor: true
20 changes: 8 additions & 12 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ class ContentExtractor
private ?string $language = null;
/** @var string[] */
private array $authors = [];
/** @var \DOMElement|\DOMNode|null */
private $body = null;
private ?\DOMElement $body = null;
private ?string $image = null;
private bool $nativeAd = false;
private ?string $date = null;
Expand Down Expand Up @@ -500,7 +499,7 @@ public function process(string $html, UriInterface $url, ?SiteConfig $siteConfig
$this->logger->info('Detecting body');
$this->body = $this->readability->getContent();

if (1 === $this->body->childNodes->length && \XML_ELEMENT_NODE === $this->body->firstChild->nodeType) {
if (1 === $this->body->childNodes->length && $this->body->firstChild instanceof \DOMElement) {
$this->body = $this->body->firstChild;
}

Expand All @@ -517,11 +516,11 @@ public function process(string $html, UriInterface $url, ?SiteConfig $siteConfig
if (isset($this->title) && '' !== $this->title && null !== $this->body->firstChild) {
$firstChild = $this->body->firstChild;

while (null !== $firstChild->nextSibling && $firstChild->nodeType && (\XML_ELEMENT_NODE !== $firstChild->nodeType)) {
while (null !== $firstChild->nextSibling && !$firstChild instanceof \DOMElement) {
$firstChild = $firstChild->nextSibling;
}

if (\XML_ELEMENT_NODE === $firstChild->nodeType
if ($firstChild instanceof \DOMElement
&& \in_array(strtolower($firstChild->tagName), ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], true)
&& (strtolower(trim($firstChild->textContent)) === strtolower(trim($this->title)))) {
$this->body->removeChild($firstChild);
Expand Down Expand Up @@ -648,10 +647,7 @@ public function processStringReplacements(string $html, UriInterface $url, ?Site
return $html;
}

/**
* @return \DOMElement|\DOMNode|null
*/
public function getContent()
public function getContent(): ?\DOMElement
{
return $this->body;
}
Expand Down Expand Up @@ -1005,9 +1001,9 @@ private function extractBody(bool $detectBody, string $xpathExpression, ?\DOMNod
$this->logger->info($type . ': found "' . $elems->length . '" with ' . $xpathExpression);

if (1 === $elems->length) {
// body can't be an attribute
if ($elems->item(0) instanceof \DOMAttr) {
$this->logger->info('Body can not be an attribute');
// body can't be anything other than element
if (!$elems->item(0) instanceof \DOMElement) {
$this->logger->info('Body must be an element');

return true;
}
Expand Down
6 changes: 3 additions & 3 deletions src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ public function toggleImgNoReferrer(bool $toggle = false): void
/**
* Cleanup HTML from a DOMElement or a string.
*
* @param string|\DOMElement|\DOMNode $contentBlock
* @param string|\DOMElement $contentBlock
*/
public function cleanupHtml($contentBlock, UriInterface $url): string
{
Expand Down Expand Up @@ -226,13 +226,13 @@ public function cleanupHtml($contentBlock, UriInterface $url): string

// remove empty text nodes
foreach ($contentBlock->childNodes as $n) {
if (\XML_TEXT_NODE === $n->nodeType && '' === trim($n->textContent)) {
if ($n instanceof \DOMText && '' === trim($n->textContent)) {
$contentBlock->removeChild($n);
}
}

// remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
while (1 === $contentBlock->childNodes->length && \XML_ELEMENT_NODE === $contentBlock->firstChild->nodeType) {
while (1 === $contentBlock->childNodes->length && $contentBlock->firstChild instanceof \DOMElement) {
// only follow these tag names
if (!\in_array(strtolower($contentBlock->tagName), ['div', 'article', 'section', 'header', 'footer'], true)) {
break;
Expand Down

0 comments on commit 8bb8da1

Please sign in to comment.