diff --git a/src/features/metadata_base.py b/src/features/metadata_base.py index 01d1403..94a2ebe 100644 --- a/src/features/metadata_base.py +++ b/src/features/metadata_base.py @@ -1,10 +1,8 @@ import asyncio -import multiprocessing import os import re from collections import OrderedDict from enum import Enum -from itertools import repeat from urllib.parse import urlparse import adblockparser @@ -34,11 +32,11 @@ class MetadataBaseException(Exception): pass -def _parallel_rule_matching(rules, html, options): - return [rule for rule in rules if rule.match_url(html, options)] - - class MetadataBase: + """ + Base class for features to be extracted. + """ + tag_list: list = [] tag_list_last_modified = "" tag_list_expires: int = 0 @@ -202,14 +200,21 @@ def _work_header(self, header): def _extract_raw_links(soup: BeautifulSoup) -> list: return list({a["href"] for a in soup.find_all(href=True)}) - def _parse_adblock_rules( - self, website_data: WebsiteData, html: str - ) -> list: + def _parse_adblock_rules(self, website_data: WebsiteData) -> list: values = [] + self.adblockparser_options["domain"] = website_data.top_level_domain + for url in website_data.raw_links: - is_blocked = self.match_rules.should_block(url) - if is_blocked: - values.append(url) + values += [ + el.group() + for el in self.match_rules.blacklist_re.finditer(url) + ] + values += [ + rule + for rule in self.match_rules.blacklist_with_options + if rule.match_url(url, self.adblockparser_options) + ] + return values def _work_html_content(self, website_data: WebsiteData) -> list: @@ -217,13 +222,11 @@ def _work_html_content(self, website_data: WebsiteData) -> list: self._logger.info(f"{self.__class__.__name__},{len(self.tag_list)}") if self.tag_list: - html = "".join(website_data.html) if self.extraction_method == ExtractionMethod.MATCH_DIRECTLY: + html = "".join(website_data.html) values = [ele for ele in self.tag_list if html.find(ele) >= 0] elif self.extraction_method == ExtractionMethod.USE_ADBLOCK_PARSER: - values = self._parse_adblock_rules( - website_data=website_data, html=html - ) + values = self._parse_adblock_rules(website_data=website_data) return values diff --git a/tests/integration/features_integration_test.py b/tests/integration/features_integration_test.py new file mode 100644 index 0000000..e2c66fa --- /dev/null +++ b/tests/integration/features_integration_test.py @@ -0,0 +1,56 @@ +from features.html_based import Advertisement +from features.website_manager import WebsiteData, WebsiteManager +from lib.logger import create_logger + +# TODO Check other features, e.g. adult: +# html = { +# "html": "9content.com\n,ytimm.com\n,boyzshop.com/affimages/", +# "har": "", +# "url": "", +# } +# expected = { +# "easylist_adult": { +# "values": ["9content.com", "ad_slot="], +# "runs_within": 10, # time the evaluation may take AT MAX -> acceptance test} +# } +# } + + +def test_advertisement(mocker): + _logger = create_logger() + + advertisement = Advertisement(_logger) + + advertisement.setup() + website_manager = WebsiteManager.get_instance() + + html = { + "html": "ad_block, ad_slot= mallorcash.com admanmedia murkymouse.online", + "har": "", + "url": "", + "headers": "{}", + } + expected = { + "advertisement": { + "values": [], # ["ad_block", "ad_slot="] + "runs_within": 10, # time the evaluation may take AT MAX -> acceptance test + }, + } + + website_manager.load_raw_data(html) + website_data = WebsiteData(html=html["html"], raw_header="", headers={}) + website_data.raw_links = [html["html"]] + website_data.html = html["html"] + + data = advertisement.start() + + website_manager.reset() + + assert ( + data["advertisement"]["values"] == expected["advertisement"]["values"] + ) + runs_fast_enough = ( + data["advertisement"]["time_required"] + <= expected["advertisement"]["runs_within"] + ) + assert runs_fast_enough diff --git a/tests/unit/metadatabase_test.py b/tests/unit/metadatabase_test.py index 520bb19..0c2ec3c 100644 --- a/tests/unit/metadatabase_test.py +++ b/tests/unit/metadatabase_test.py @@ -1,4 +1,3 @@ -import asyncio from unittest import mock import adblockparser @@ -49,7 +48,6 @@ def test_start(metadatabase: MetadataBase, mocker): assert values_has_only_one_key assert values[metadatabase.key]["values"] == [] - # TODO: An if in a test -> is this a bad idea? if "tag_list_last_modified" in values[metadatabase.key].keys(): assert values[metadatabase.key]["tag_list_last_modified"] == "" assert values[metadatabase.key]["tag_list_expires"] == 0