From 2aea19fdb074502ed121820a68b7a9d13c893102 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sat, 22 May 2021 14:27:41 +0200 Subject: [PATCH] Add support of site-config's wrap_in() Fixes #180 Signed-off-by: Kevin Decherf --- phpstan.neon | 4 ++ src/Extractor/ContentExtractor.php | 42 ++++++++++++++++++++ src/SiteConfig/ConfigBuilder.php | 9 +++++ src/SiteConfig/SiteConfig.php | 7 ++++ tests/Extractor/ContentExtractorTest.php | 49 ++++++++++++++++++++++++ 5 files changed, 111 insertions(+) diff --git a/phpstan.neon b/phpstan.neon index b12830de..d569f7d7 100644 --- a/phpstan.neon +++ b/phpstan.neon @@ -17,6 +17,10 @@ parameters: - message: '#typehint specified.#' path: %currentWorkingDirectory%/src/ + # phpstan does not seem to recognize the class override for JSLikeHTMLElement + - + message: '#Call to an undefined method DOMElement::setInnerHtml\(\)#' + path: %currentWorkingDirectory%/src/Extractor/ContentExtractor.php inferPrivatePropertyTypeFromConstructor: true checkMissingIterableValueType: false diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php index 1ba4de3c..9319b8fa 100644 --- a/src/Extractor/ContentExtractor.php +++ b/src/Extractor/ContentExtractor.php @@ -306,6 +306,20 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smartTidy = } } + // wrapping matching elements with provided tag + foreach ($this->siteConfig->wrap_in as $tag => $pattern) { + $this->logger->info('Trying {pattern} to wrap element with {tag}', ['pattern' => $pattern, 'tag' => $tag]); + $elems = $this->xpath->query($pattern, $this->readability->dom); + + if (false === $elems) { + $this->logger->info('Bad pattern'); + + continue; + } + + $this->wrapElements($elems, $tag, 'Wrapping {length} elements (wrap_in)'); + } + // strip elements (using xpath expressions) foreach ($this->siteConfig->strip as $pattern) { $this->logger->info('Trying {pattern} to strip element', ['pattern' => $pattern]); @@ -778,6 +792,34 @@ private function removeElements($elems = false, $logMessage = null) } } + /** + * Wrap elements with provided tag. + * + * @param \DOMNodeList|false $elems + * @param string $tag + * @param string $logMessage + */ + private function wrapElements($elems = false, $tag = 'div', $logMessage = null) + { + if (false === $elems || false === $this->hasElements($elems)) { + return; + } + + if (null !== $logMessage) { + $this->logger->info($logMessage, ['length' => $elems->length]); + } + + $a = iterator_to_array($elems); + foreach ($a as $item) { + if (null !== $item && null !== $item->parentNode && $item instanceof \DOMElement) { + $newNode = $item->ownerDocument->createElement($tag); + $newNode->setInnerHtml($item->ownerDocument->saveXML($item)); + + $item->parentNode->replaceChild($newNode, $item); + } + } + } + /** * Extract entity for a given CSS class a node. * diff --git a/src/SiteConfig/ConfigBuilder.php b/src/SiteConfig/ConfigBuilder.php index 14188a41..e7c8af4e 100644 --- a/src/SiteConfig/ConfigBuilder.php +++ b/src/SiteConfig/ConfigBuilder.php @@ -23,6 +23,13 @@ class ConfigBuilder 'accept', ]; + // Array of accepted HTML tags for wrap_in() + private $acceptedWrapInTags = [ + 'blockquote', + 'p', + 'div', + ]; + /** * @param array $config */ @@ -394,6 +401,8 @@ public function parseLines(array $lines) // special treatment for if_page_contains } elseif (\in_array($command, ['if_page_contains'], true)) { $this->handleIfPageContainsCondition($config, $val); + } elseif ((')' === substr($command, -1)) && preg_match('!([a-z0-9_]+)\(([a-z]+)\)$!i', $command, $match) && 'wrap_in' === $match[1] && \in_array(strtolower($match[2]), $this->acceptedWrapInTags, true)) { + $config->wrap_in[strtolower(trim($match[2]))] = $val; } } diff --git a/src/SiteConfig/SiteConfig.php b/src/SiteConfig/SiteConfig.php index 826d66f1..65f63c23 100644 --- a/src/SiteConfig/SiteConfig.php +++ b/src/SiteConfig/SiteConfig.php @@ -221,6 +221,13 @@ class SiteConfig */ public $skip_json_ld = false; + /** + * Wrap elements matching these xpath expressions with the specified tag (associative array). + * + * @var array + */ + public $wrap_in = []; + /** * Used if undeclared. * diff --git a/tests/Extractor/ContentExtractorTest.php b/tests/Extractor/ContentExtractorTest.php index fcc0516e..74fec772 100644 --- a/tests/Extractor/ContentExtractorTest.php +++ b/tests/Extractor/ContentExtractorTest.php @@ -1168,4 +1168,53 @@ public function testBadDate(): void $this->assertTrue($res, 'Extraction went fine'); $this->assertNull($contentExtractor->getDate(), 'Date got vanish because it was wrong'); } + + public function dataForProcessWrapIn(): array + { + return [ + // blockquote with a nested div + [ + [ + 'blockquote' => "//div[@class='cond1']", + ], + "//blockquote/div[@class='cond1']/p", + ], + [ + [ + 'blockquote' => "//div[@class='cond1']/p", + ], + "//div[@class='cond1']/blockquote/p", + ], + ]; + } + + /** + * Test config wrap_in. + * + * @dataProvider dataForProcessWrapIn + */ + public function testProcessWrapIn(array $wrapIn, string $xpathQuery): void + { + $contentExtractor = new ContentExtractor(self::$contentExtractorConfig); + + $config = new SiteConfig(); + $config->body = ['//article']; + $config->wrap_in = $wrapIn; + + $res = $contentExtractor->process( + '

Hello world

', + 'https://example.com/wrapin', + $config + ); + + $this->assertTrue($res, 'Extraction went well'); + + $content_block = $contentExtractor->getContent(); + $doc = new \DOMDocument(); + $doc->loadXML($content_block->innerHTML); + $xpath = new \DOMXPath($doc); + + $el = $xpath->query($xpathQuery); + $this->assertCount(1, $el ?: []); + } }