Skip to content

Commit

Permalink
Add support of site-config's wrap_in()
Browse files Browse the repository at this point in the history
Fixes #180

Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
  • Loading branch information
Kdecherf committed May 29, 2021
1 parent edbf4b8 commit 2aea19f
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 0 deletions.
4 changes: 4 additions & 0 deletions phpstan.neon
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ parameters:
-
message: '#typehint specified.#'
path: %currentWorkingDirectory%/src/
# phpstan does not seem to recognize the class override for JSLikeHTMLElement
-
message: '#Call to an undefined method DOMElement::setInnerHtml\(\)#'
path: %currentWorkingDirectory%/src/Extractor/ContentExtractor.php

inferPrivatePropertyTypeFromConstructor: true
checkMissingIterableValueType: false
Expand Down
42 changes: 42 additions & 0 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,20 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smartTidy =
}
}

// wrapping matching elements with provided tag
foreach ($this->siteConfig->wrap_in as $tag => $pattern) {
$this->logger->info('Trying {pattern} to wrap element with {tag}', ['pattern' => $pattern, 'tag' => $tag]);
$elems = $this->xpath->query($pattern, $this->readability->dom);

if (false === $elems) {
$this->logger->info('Bad pattern');

continue;
}

$this->wrapElements($elems, $tag, 'Wrapping {length} elements (wrap_in)');
}

// strip elements (using xpath expressions)
foreach ($this->siteConfig->strip as $pattern) {
$this->logger->info('Trying {pattern} to strip element', ['pattern' => $pattern]);
Expand Down Expand Up @@ -778,6 +792,34 @@ private function removeElements($elems = false, $logMessage = null)
}
}

/**
 * Wrap each matched element in a newly created element named $tag.
 *
 * The wrapper takes the matched element's place in the tree and the element
 * itself is then moved inside it. Moving the node (rather than serialising it
 * and re-parsing the markup) keeps the original node identity, so matches
 * that are nested inside another match are still attached to the document
 * when their turn comes and get wrapped as well.
 *
 * @param \DOMNodeList|false $elems      Nodes to wrap; nothing happens when false or when hasElements() reports false
 * @param string             $tag        Name of the wrapping element
 * @param string|null        $logMessage Optional message sent to the logger with a "length" context value
 */
private function wrapElements($elems = false, $tag = 'div', $logMessage = null)
{
    if (false === $elems || false === $this->hasElements($elems)) {
        return;
    }

    if (null !== $logMessage) {
        $this->logger->info($logMessage, ['length' => $elems->length]);
    }

    // iterate over a snapshot: the tree is modified while looping
    foreach (iterator_to_array($elems) as $item) {
        // only attached elements can be wrapped (instanceof also rules out null)
        if (!$item instanceof \DOMElement || null === $item->parentNode) {
            continue;
        }

        $wrapper = $item->ownerDocument->createElement($tag);

        // put the wrapper where the element was, then move the element into it
        $item->parentNode->replaceChild($wrapper, $item);
        $wrapper->appendChild($item);
    }
}

/**
* Extract entity for a given CSS class a node.
*
Expand Down
9 changes: 9 additions & 0 deletions src/SiteConfig/ConfigBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ class ConfigBuilder
'accept',
];

/**
 * HTML tags accepted as the wrapper in a wrap_in(tag) directive.
 *
 * parseLines() compares the lowercased tag against this list and only stores
 * the directive in the config when the tag is listed here.
 *
 * @var string[]
 */
private $acceptedWrapInTags = [
'blockquote',
'p',
'div',
];

/**
* @param array $config
*/
Expand Down Expand Up @@ -394,6 +401,8 @@ public function parseLines(array $lines)
// special treatment for if_page_contains
} elseif (\in_array($command, ['if_page_contains'], true)) {
$this->handleIfPageContainsCondition($config, $val);
} elseif ((')' === substr($command, -1)) && preg_match('!([a-z0-9_]+)\(([a-z]+)\)$!i', $command, $match) && 'wrap_in' === $match[1] && \in_array(strtolower($match[2]), $this->acceptedWrapInTags, true)) {
$config->wrap_in[strtolower(trim($match[2]))] = $val;
}
}

Expand Down
7 changes: 7 additions & 0 deletions src/SiteConfig/SiteConfig.php
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,13 @@ class SiteConfig
*/
public $skip_json_ld = false;

/**
 * Wrap elements matching these xpath expressions with the specified tag (associative array).
 *
 * Keys are the wrapping tag names and values are the xpath expressions
 * selecting the elements to wrap, e.g. ['blockquote' => "//div[@class='quote']"].
 * Being keyed by tag, the array holds a single expression per tag.
 *
 * @var array<string, string>
 */
public $wrap_in = [];

/**
* Used if undeclared.
*
Expand Down
49 changes: 49 additions & 0 deletions tests/Extractor/ContentExtractorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -1168,4 +1168,53 @@ public function testBadDate(): void
$this->assertTrue($res, 'Extraction went fine');
$this->assertNull($contentExtractor->getDate(), 'Date got vanish because it was wrong');
}

public function dataForProcessWrapIn(): array
{
    // the whole div is wrapped: the blockquote becomes the parent of the div
    $wrapOuterDiv = [
        ['blockquote' => "//div[@class='cond1']"],
        "//blockquote/div[@class='cond1']/p",
    ];

    // only the paragraph is wrapped: the blockquote sits between the div and the p
    $wrapInnerParagraph = [
        ['blockquote' => "//div[@class='cond1']/p"],
        "//div[@class='cond1']/blockquote/p",
    ];

    // each entry is [wrap_in config, xpath expected to match exactly once]
    return [$wrapOuterDiv, $wrapInnerParagraph];
}

/**
 * Check that the wrap_in site config wraps the matched elements.
 *
 * Each dataset provides the wrap_in map to apply and an xpath query that
 * must match exactly one node in the extracted content.
 *
 * @dataProvider dataForProcessWrapIn
 */
public function testProcessWrapIn(array $wrapIn, string $xpathQuery): void
{
    $siteConfig = new SiteConfig();
    $siteConfig->wrap_in = $wrapIn;
    $siteConfig->body = ['//article'];

    $extractor = new ContentExtractor(self::$contentExtractorConfig);
    $extracted = $extractor->process(
        '<html><article><div class="cond1"><p>Hello world</p></div></article></html>',
        'https://example.com/wrapin',
        $siteConfig
    );

    $this->assertTrue($extracted, 'Extraction went well');

    // reload the extracted markup in a fresh document to query it
    $dom = new \DOMDocument();
    $dom->loadXML($extractor->getContent()->innerHTML);

    $matches = (new \DOMXPath($dom))->query($xpathQuery);
    $this->assertCount(1, $matches ?: []);
}
}

0 comments on commit 2aea19f

Please sign in to comment.