Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reimplement full-text scraping #563

Merged
merged 3 commits into from
Dec 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@
"arthurhoaro/favicon": "^1.2",
"ext-json": "*",
"ext-simplexml": "*",
"ext-libxml": "*"
"ext-libxml": "*",
"andreskrey/readability.php": "^2.1"
},
"require-dev": {
"phpunit/phpunit": "^7.5",
Expand Down
55 changes: 54 additions & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
use OCA\News\Fetcher\Fetcher;
use OCA\News\Fetcher\YoutubeFetcher;
use OCA\News\Utility\ProxyConfigParser;
use OCA\News\Scraper\Scraper;

/**
* Class Application
Expand Down Expand Up @@ -193,5 +194,14 @@ public function __construct(array $urlParams = [])
$fetcher->registerFetcher($c->query(FeedFetcher::class));
return $fetcher;
});

/**
* Scrapers
*/
$container->registerService(Scraper::class, function (IContainer $c): Scraper {
return new Scraper(
$c->query(PsrLogger::class)
);
});
}
}
6 changes: 4 additions & 2 deletions lib/Command/ShowFeed.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,19 @@ protected function configure()
->setDescription('Prints a JSON string which represents the given feed as it would be in the DB.')
->addArgument('feed', InputArgument::REQUIRED, 'Feed to parse')
->addOption('user', 'u', InputOption::VALUE_OPTIONAL, 'Username for the feed')
->addOption('password', 'p', InputOption::VALUE_OPTIONAL, 'Password for the feed');
->addOption('password', 'p', InputOption::VALUE_OPTIONAL, 'Password for the feed')
->addOption('full-text', 'f', InputOption::VALUE_NONE, 'Usa a scraper to get full text');
}

protected function execute(InputInterface $input, OutputInterface $output)
{
$url = $input->getArgument('feed');
$user = $input->getOption('user');
$password = $input->getOption('password');
$fullTextEnabled = (bool) $input->getOption('full-text');

try {
list($feed, $items) = $this->feedFetcher->fetch($url, true, null, $user, $password);
list($feed, $items) = $this->feedFetcher->fetch($url, true, null, $fullTextEnabled, $user, $password);
$output->writeln("Feed: " . json_encode($feed, JSON_PRETTY_PRINT));
$output->writeln("Items: " . json_encode($items, JSON_PRETTY_PRINT));
} catch (\Throwable $ex) {
Expand Down
58 changes: 43 additions & 15 deletions lib/Fetcher/FeedFetcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
use OCA\News\Db\Item;
use OCA\News\Db\Feed;
use OCA\News\Utility\Time;
use OCA\News\Scraper\Scraper;
use SimpleXMLElement;

class FeedFetcher implements IFeedFetcher
Expand All @@ -36,14 +37,22 @@ class FeedFetcher implements IFeedFetcher
private $l10n;
private $time;
private $logger;

public function __construct(FeedIo $fetcher, Favicon $favicon, IL10N $l10n, Time $time, PsrLogger $logger)
{
private $scraper;

/**
 * Store the collaborators this fetcher depends on.
 *
 * @param FeedIo    $fetcher Feed reader, kept as $this->reader
 * @param Favicon   $favicon Favicon helper, kept as $this->faviconFactory
 * @param IL10N     $l10n    Localisation service
 * @param Time      $time    Time helper
 * @param PsrLogger $logger  Logger used for debug output while fetching
 * @param Scraper   $scraper Scraper consulted when full-text fetching is enabled
 */
public function __construct(
    FeedIo $fetcher,
    Favicon $favicon,
    IL10N $l10n,
    Time $time,
    PsrLogger $logger,
    Scraper $scraper
) {
    // Feed reading and full-text scraping
    $this->reader = $fetcher;
    $this->scraper = $scraper;

    // Supporting services
    $this->faviconFactory = $favicon;
    $this->time = $time;
    $this->l10n = $l10n;
    $this->logger = $logger;
}


Expand All @@ -65,7 +74,7 @@ public function canHandle($url): bool
*
* @inheritdoc
*/
public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array
public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array
{
$url2 = new Net_URL2($url);
if (!empty($user) && !empty(trim($user))) {
Expand Down Expand Up @@ -99,12 +108,32 @@ public function fetch(string $url, bool $favicon, $lastModified, $user, $passwor
);

$items = [];
$RTL = $this->determineRtl($parsedFeed);
$feedName = $parsedFeed->getTitle();
$this->logger->debug('Feed {url} was modified since last fetch. #{count} items', [
'url' => $url,
'count' => count($parsedFeed),
]);

foreach ($parsedFeed as $item) {
$items[] = $this->buildItem($item, $parsedFeed);
$body = null;
$currRTL = $RTL;

// Scrape content if enabled
if ($fullTextEnabled) {
if ($this->scraper->scrape($item->getLink())) {
$body = $this->scraper->getContent();
$currRTL = $this->scraper->getRTL($currRTL);
}
}

$builtItem = $this->buildItem($item, $body, $currRTL);
$this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [
'title' => $builtItem->getTitle(),
'feed' => $feedName,
'datetime' => $builtItem->getLastModified(),
]);
$items[] = $builtItem;
}

return [$feed, $items];
Expand Down Expand Up @@ -164,11 +193,12 @@ protected function determineRtl(FeedInterface $parsedFeed): bool
* Build an item based on a feed.
*
* @param ItemInterface $parsedItem The item to use
* @param FeedInterface $parsedFeed The feed to use
* @param string $body Text of the item, if not provided use description from $parsedItem
* @param bool $RTL True if the feed is RTL (Right-to-left)
*
* @return Item
*/
protected function buildItem(ItemInterface $parsedItem, FeedInterface $parsedFeed): Item
protected function buildItem(ItemInterface $parsedItem, string $body = null, bool $RTL = false): Item
{
$item = new Item();
$item->setUnread(true);
Expand All @@ -188,7 +218,7 @@ protected function buildItem(ItemInterface $parsedItem, FeedInterface $parsedFee
$item->setPubDate($pubDT->getTimestamp());

$item->setLastModified($lastmodified->getTimestamp());
$item->setRtl($this->determineRtl($parsedFeed));
$item->setRtl($RTL);

// unescape content because angularjs helps against XSS
$item->setTitle($this->decodeTwice($parsedItem->getTitle()));
Expand All @@ -197,8 +227,12 @@ protected function buildItem(ItemInterface $parsedItem, FeedInterface $parsedFee
$item->setAuthor($this->decodeTwice($author->getName()));
}

// Use description from feed if body is not provided (by a scraper)
if ($body === null) {
$body = $parsedItem->getValue("content:encoded") ?? $parsedItem->getDescription();
}

// purification is done in the service layer
$body = $parsedItem->getValue("content:encoded") ?? $parsedItem->getDescription();
$body = mb_convert_encoding(
$body,
'HTML-ENTITIES',
Expand Down Expand Up @@ -231,12 +265,6 @@ protected function buildItem(ItemInterface $parsedItem, FeedInterface $parsedFee
}

$item->generateSearchIndex();

$this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [
'title' => $item->getTitle(),
'feed' => $parsedFeed->getTitle(),
'datetime' => $item->getLastModified(),
]);
return $item;
}

Expand Down
18 changes: 13 additions & 5 deletions lib/Fetcher/Fetcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,24 @@ public function registerFetcher(IFeedFetcher $fetcher)
*
* @param string $url remote url of the feed
* @param boolean $getFavicon if the favicon should also be fetched, defaults to true
* @param string $lastModified a last modified value from an http header defaults to false.
* @param string|null $lastModified a last modified value from an http header, defaults to null.
* If lastModified matches the http header from the feed no results are fetched
* @param string $user if given, basic auth is set for this feed
* @param string $password if given, basic auth is set for this feed. Ignored if user is empty
* @param bool $fullTextEnabled If true use a scraper to download the full article
* @param string $user if given, basic auth is set for this feed
* @param string $password if given, basic auth is set for this feed. Ignored if user is empty
*
* @throws ReadErrorException if FeedIO fails
* @return array an array containing the new feed and its items, first
* element being the Feed and second element being an array of Items
*/
public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null)
{
public function fetch(
$url,
$getFavicon = true,
$lastModified = null,
$fullTextEnabled = false,
$user = null,
$password = null
) {
foreach ($this->fetchers as $fetcher) {
if (!$fetcher->canHandle($url)) {
continue;
Expand All @@ -64,6 +71,7 @@ public function fetch($url, $getFavicon = true, $lastModified = null, $user = nu
$url,
$getFavicon,
$lastModified,
$fullTextEnabled,
$user,
$password
);
Expand Down
3 changes: 2 additions & 1 deletion lib/Fetcher/IFeedFetcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ interface IFeedFetcher
* @param boolean $favicon if the favicon should also be fetched, defaults to true
* @param string|null $lastModified a last modified value from an http header defaults to false.
* If lastModified matches the http header from the feed no results are fetched
* @param bool $fullTextEnabled If true use a scraper to download the full article
* @param string|null $user if given, basic auth is set for this feed
* @param string|null $password if given, basic auth is set for this feed. Ignored if user is empty
*
* @return array an array containing the new feed and its items, first
* element being the Feed and second element being an array of Items
* @throws ReadErrorException if the Feed-IO fetcher encounters a problem
*/
public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array;
public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array;

/**
* Can a fetcher handle a feed.
Expand Down
3 changes: 2 additions & 1 deletion lib/Fetcher/YoutubeFetcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,15 @@ public function canHandle($url): bool
*
* @inheritdoc
*/
public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array
public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array
{
$transformedUrl = $this->buildUrl($url);

$result = $this->feedFetcher->fetch(
$transformedUrl,
$favicon,
$lastModified,
$fullTextEnabled,
$user,
$password
);
Expand Down
43 changes: 43 additions & 0 deletions lib/Scraper/IScraper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<?php
/**
* Nextcloud - News
*
* This file is licensed under the Affero General Public License version 3 or
* later. See the COPYING file.
*
* @author Gioele Falcetti <thegio.f@gmail.com>
* @copyright 2019 Gioele Falcetti
*/

namespace OCA\News\Scraper;

/**
* Contract for a full-text scraper.
*
* A scraper is asked to scrape a URL and afterwards exposes the content it
* extracted together with the text direction of that content.
*/
interface IScraper
{
/**
* Scrape the page behind the given URL.
*
* @param string $url Address of the page to scrape
*
* @return bool False if failed
*/
public function scrape(string $url): bool;

/**
* Get the scraped content.
*
* @return string The content obtained by the scraper
*/
public function getContent(): string;

/**
* Get the RTL (right-to-left) information.
*
* @param bool $default Return this value if the scraper is unable to determine it
*
* @return bool
*/
public function getRTL(bool $default = false): bool;
}
Loading