Bug list checking (#39)

* Bug fixing
* Performance optimization adblockparser
* Correcting tag list urls
* WIP: Replace asyncio with multiprocessing
* Resolved bug of missing logger
* Replace asyncio with multiprocessing
* Cleanup
* Cleanup
  - Making long running adblockparsing processes async and parallel
* Implementing _astart for html based features
* Removing unneeded allowlists
  - Reducing number of rules per pool
* Test coverage
* WIP: Building tests
* WIP: Thoughts
* Reverting parallel processing for simplification
* Adding advertisement feature integration test
codecentric · Jan 4, 2021 · 6b1aaf6 · 6b1aaf6
1 parent 0a73de4
commit 6b1aaf6
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 18 deletions.
diff --git a/src/features/metadata_base.py b/src/features/metadata_base.py
@@ -1,10 +1,8 @@
 import asyncio
-import multiprocessing
 import os
 import re
 from collections import OrderedDict
 from enum import Enum
-from itertools import repeat
 from urllib.parse import urlparse
 
 import adblockparser
@@ -34,11 +32,11 @@ class MetadataBaseException(Exception):
     pass
 
 
-def _parallel_rule_matching(rules, html, options):
-    return [rule for rule in rules if rule.match_url(html, options)]
-
-
 class MetadataBase:
+    """
+    Base class for features to be extracted.
+    """
+
     tag_list: list = []
     tag_list_last_modified = ""
     tag_list_expires: int = 0
@@ -202,28 +200,33 @@ def _work_header(self, header):
     def _extract_raw_links(soup: BeautifulSoup) -> list:
         return list({a["href"] for a in soup.find_all(href=True)})
 
-    def _parse_adblock_rules(
-        self, website_data: WebsiteData, html: str
-    ) -> list:
+    def _parse_adblock_rules(self, website_data: WebsiteData) -> list:
         values = []
+        self.adblockparser_options["domain"] = website_data.top_level_domain
+
         for url in website_data.raw_links:
-            is_blocked = self.match_rules.should_block(url)
-            if is_blocked:
-                values.append(url)
+            values += [
+                el.group()
+                for el in self.match_rules.blacklist_re.finditer(url)
+            ]
+            values += [
+                rule
+                for rule in self.match_rules.blacklist_with_options
+                if rule.match_url(url, self.adblockparser_options)
+            ]
+
         return values
 
     def _work_html_content(self, website_data: WebsiteData) -> list:
         values = []
 
         self._logger.info(f"{self.__class__.__name__},{len(self.tag_list)}")
         if self.tag_list:
-            html = "".join(website_data.html)
             if self.extraction_method == ExtractionMethod.MATCH_DIRECTLY:
+                html = "".join(website_data.html)
                 values = [ele for ele in self.tag_list if html.find(ele) >= 0]
             elif self.extraction_method == ExtractionMethod.USE_ADBLOCK_PARSER:
-                values = self._parse_adblock_rules(
-                    website_data=website_data, html=html
-                )
+                values = self._parse_adblock_rules(website_data=website_data)
 
         return values
 

diff --git a/tests/integration/features_integration_test.py b/tests/integration/features_integration_test.py
@@ -0,0 +1,56 @@
+from features.html_based import Advertisement
+from features.website_manager import WebsiteData, WebsiteManager
+from lib.logger import create_logger
+
+# TODO Check other features, e.g. adult:
+# html = {
+#     "html": "9content.com\n,ytimm.com\n,boyzshop.com/affimages/",
+#     "har": "",
+#     "url": "",
+# }
+# expected = {
+#     "easylist_adult": {
+#         "values": ["9content.com", "ad_slot="],
+#         "runs_within": 10,  # time the evaluation may take AT MAX -> acceptance test
+#     }
+# }
+
+
+def test_advertisement(mocker):
+    _logger = create_logger()
+
+    advertisement = Advertisement(_logger)
+
+    advertisement.setup()
+    website_manager = WebsiteManager.get_instance()
+
+    html = {
+        "html": "ad_block, ad_slot= mallorcash.com admanmedia murkymouse.online",
+        "har": "",
+        "url": "",
+        "headers": "{}",
+    }
+    expected = {
+        "advertisement": {
+            "values": [],  # ["ad_block", "ad_slot="]
+            "runs_within": 10,  # time the evaluation may take AT MAX -> acceptance test
+        },
+    }
+
+    website_manager.load_raw_data(html)
+    website_data = WebsiteData(html=html["html"], raw_header="", headers={})
+    website_data.raw_links = [html["html"]]
+    website_data.html = html["html"]
+
+    data = advertisement.start()
+
+    website_manager.reset()
+
+    assert (
+        data["advertisement"]["values"] == expected["advertisement"]["values"]
+    )
+    runs_fast_enough = (
+        data["advertisement"]["time_required"]
+        <= expected["advertisement"]["runs_within"]
+    )
+    assert runs_fast_enough
diff --git a/tests/unit/metadatabase_test.py b/tests/unit/metadatabase_test.py
@@ -1,4 +1,3 @@
-import asyncio
 from unittest import mock
 
 import adblockparser
@@ -49,7 +48,6 @@ def test_start(metadatabase: MetadataBase, mocker):
     assert values_has_only_one_key
     assert values[metadatabase.key]["values"] == []
 
-    # TODO: An if in a test -> is this a bad idea?
     if "tag_list_last_modified" in values[metadatabase.key].keys():
         assert values[metadatabase.key]["tag_list_last_modified"] == ""
         assert values[metadatabase.key]["tag_list_expires"] == 0