diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php
index 1ba4de3c..9319b8fa 100644
--- a/src/Extractor/ContentExtractor.php
+++ b/src/Extractor/ContentExtractor.php
@@ -306,6 +306,20 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smartTidy =
             }
         }
 
+        // wrapping matching elements with provided tag
+        foreach ($this->siteConfig->wrap_in as $tag => $pattern) {
+            $this->logger->info('Trying {pattern} to wrap element with {tag}', ['pattern' => $pattern, 'tag' => $tag]);
+            $elems = $this->xpath->query($pattern, $this->readability->dom);
+
+            if (false === $elems) {
+                $this->logger->info('Bad pattern');
+
+                continue;
+            }
+
+            $this->wrapElements($elems, $tag, 'Wrapping {length} elements (wrap_in)');
+        }
+
         // strip elements (using xpath expressions)
         foreach ($this->siteConfig->strip as $pattern) {
             $this->logger->info('Trying {pattern} to strip element', ['pattern' => $pattern]);
@@ -778,6 +792,41 @@ private function removeElements($elems = false, $logMessage = null)
         }
     }
 
+    /**
+     * Wrap each matched element in a newly created element named $tag.
+     *
+     * The wrapper takes the element's place in the tree and the element itself is
+     * then moved inside it (no serialize/re-parse round trip), so nested matches
+     * are wrapped in the document too instead of in a detached copy.
+     *
+     * @param \DOMNodeList|false $elems      Nodes to wrap; nothing happens for false or when hasElements() is false
+     * @param string             $tag        Name of the wrapping element
+     * @param string|null        $logMessage Logged with a {length} placeholder when not null
+     */
+    private function wrapElements($elems = false, $tag = 'div', $logMessage = null)
+    {
+        if (false === $elems || false === $this->hasElements($elems)) {
+            return;
+        }
+
+        if (null !== $logMessage) {
+            $this->logger->info($logMessage, ['length' => $elems->length]);
+        }
+
+        // iterate over a copy of the list because the tree is modified inside the loop
+        foreach (iterator_to_array($elems) as $item) {
+            if (!$item instanceof \DOMElement || null === $item->parentNode) {
+                continue;
+            }
+
+            $wrapper = $item->ownerDocument->createElement($tag);
+
+            // put the wrapper where the element was, then move the element into it
+            $item->parentNode->replaceChild($wrapper, $item);
+            $wrapper->appendChild($item);
+        }
+    }
+
     /**
      * Extract entity for a given CSS class a node.
      *
diff --git a/src/SiteConfig/ConfigBuilder.php b/src/SiteConfig/ConfigBuilder.php
index 33f8d5a3..e7c8af4e 100644
--- a/src/SiteConfig/ConfigBuilder.php
+++ b/src/SiteConfig/ConfigBuilder.php
@@ -15,6 +15,21 @@ class ConfigBuilder
     private $configFiles = [];
     private $cache = [];
 
+    // Header names accepted by the http_header() directive (compared lowercased)
+    private $acceptedHeaders = [
+        'user-agent',
+        'referer',
+        'cookie',
+        'accept',
+    ];
+
+    // HTML tag names accepted by the wrap_in() directive (compared lowercased)
+    private $acceptedWrapInTags = [
+        'blockquote',
+        'p',
+        'div',
+    ];
+
     /**
      * @param array $config
      */
@@ -381,11 +396,13 @@ public function parseLines(array $lines)
             } elseif ((')' === substr($command, -1)) && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match) && 'replace_string' === $match[1]) {
                 $config->find_string[] = $match[2];
                 $config->replace_string[] = $val;
-            } elseif ((')' === substr($command, -1)) && preg_match('!^([a-z0-9_]+)\(([a-z0-9_-]+)\)$!i', $command, $match) && 'http_header' === $match[1] && \in_array(strtolower($match[2]), ['user-agent', 'referer', 'cookie', 'accept'], true)) {
+            } elseif ((')' === substr($command, -1)) && preg_match('!^([a-z0-9_]+)\(([a-z0-9_-]+)\)$!i', $command, $match) && 'http_header' === $match[1] && \in_array(strtolower($match[2]), $this->acceptedHeaders, true)) {
                 $config->http_header[strtolower(trim($match[2]))] = $val;
                 // special treatment for if_page_contains
             } elseif (\in_array($command, ['if_page_contains'], true)) {
                 $this->handleIfPageContainsCondition($config, $val);
+            } elseif ((')' === substr($command, -1)) && preg_match('!^([a-z0-9_]+)\(([a-z]+)\)$!i', $command, $match) && 'wrap_in' === $match[1] && \in_array(strtolower($match[2]), $this->acceptedWrapInTags, true)) {
+                $config->wrap_in[strtolower(trim($match[2]))] = $val;
             }
         }
 
diff --git a/src/SiteConfig/SiteConfig.php b/src/SiteConfig/SiteConfig.php
index 826d66f1..65f63c23 100644
--- a/src/SiteConfig/SiteConfig.php
+++ b/src/SiteConfig/SiteConfig.php
@@ -221,6 +221,13 @@ class SiteConfig
      */
     public $skip_json_ld = false;
 
+    /**
+     * Wrap elements matching an xpath expression with a tag (associative array: tag name => xpath expression).
+     *
+     * @var array<string, string>
+     */
+    public $wrap_in = [];
+
     /**
      * Used if undeclared.
      *
diff --git a/tests/Extractor/ContentExtractorTest.php b/tests/Extractor/ContentExtractorTest.php
index fcc0516e..74fec772 100644
--- a/tests/Extractor/ContentExtractorTest.php
+++ b/tests/Extractor/ContentExtractorTest.php
@@ -1168,4 +1168,62 @@ public function testBadDate(): void
         $this->assertTrue($res, 'Extraction went fine');
         $this->assertNull($contentExtractor->getDate(), 'Date got vanish because it was wrong');
     }
+
+    public function dataForProcessWrapIn(): array
+    {
+        return [
+            // the matched div itself ends up inside the blockquote
+            'wrap outer div' => [
+                [
+                    'blockquote' => "//div[@class='cond1']",
+                ],
+                "//blockquote/div[@class='cond1']/p",
+            ],
+            // only the paragraph is wrapped, its parent div stays in place
+            'wrap inner paragraph' => [
+                [
+                    'blockquote' => "//div[@class='cond1']/p",
+                ],
+                "//div[@class='cond1']/blockquote/p",
+            ],
+            // a pattern matching both an element and its descendant wraps each of them
+            'wrap nested matches' => [
+                [
+                    'blockquote' => "//div[@class='cond1'] | //div[@class='cond1']/p",
+                ],
+                "//blockquote/div[@class='cond1']/blockquote/p",
+            ],
+        ];
+    }
+
+    /**
+     * Test config wrap_in.
+     *
+     * @dataProvider dataForProcessWrapIn
+     */
+    public function testProcessWrapIn(array $wrapIn, string $xpathQuery): void
+    {
+        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);
+
+        $config = new SiteConfig();
+        $config->body = ['//article'];
+        $config->wrap_in = $wrapIn;
+
+        // fixture: an <article> (selected as body) holding a div.cond1 with a single paragraph
+        $res = $contentExtractor->process(
+            '<html><body><article><div class="cond1"><p>Hello world</p></div></article></body></html>',
+            'https://example.com/wrapin',
+            $config
+        );
+
+        $this->assertTrue($res, 'Extraction went well');
+
+        $contentBlock = $contentExtractor->getContent();
+        $doc = new \DOMDocument();
+        $doc->loadXML($contentBlock->innerHTML);
+        $xpath = new \DOMXPath($doc);
+
+        $el = $xpath->query($xpathQuery);
+        $this->assertCount(1, $el ?: []);
+    }
 }