Skip to content

Commit

Permalink
Bug list checking (#39)
Browse files Browse the repository at this point in the history
* - Bug fixing

* - Performance optimization adblockparser

* - Correcting tag list urls
-

* - WIP: Replace asyncio with multiprocessing

* - Resolved bug of missing logger

* - Replace asyncio with multiprocessing

* - Cleanup

* - Cleanup
- Making long running adblockparsing processes async and parallel

* - Implementing _astart for html based features

* - Removing unneeded allowlists
- Reducing number of rules per pool

* - Test coverage

* - WIP: Building tests

* - WIP: Thoughts

* - Reverting parallel processing for simplification

* - Adding advertisement feature integration test
  • Loading branch information
RobertMeissner authored Jan 4, 2021
1 parent 0a73de4 commit 6b1aaf6
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 18 deletions.
35 changes: 19 additions & 16 deletions src/features/metadata_base.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import asyncio
import multiprocessing
import os
import re
from collections import OrderedDict
from enum import Enum
from itertools import repeat
from urllib.parse import urlparse

import adblockparser
Expand Down Expand Up @@ -34,11 +32,11 @@ class MetadataBaseException(Exception):
pass


def _parallel_rule_matching(rules, html, options):
    """
    Return the rules from ``rules`` that match the given input.

    Each rule is asked via ``rule.match_url(html, options)``; rules for which
    that call is truthy are kept, in their original order.

    :param rules: iterable of objects exposing ``match_url(url, options)``.
    :param html: value handed to ``match_url`` as its first argument.
        NOTE(review): despite the name, it is passed where a URL is expected.
    :param options: forwarded unchanged to every ``match_url`` call.
    :return: list of matching rules (empty if nothing matches).
    """
    return [rule for rule in rules if rule.match_url(html, options)]


class MetadataBase:
"""
Base class for features to be extracted.
"""

tag_list: list = []
tag_list_last_modified = ""
tag_list_expires: int = 0
Expand Down Expand Up @@ -202,28 +200,33 @@ def _work_header(self, header):
def _extract_raw_links(soup: BeautifulSoup) -> list:
    """
    Collect the distinct ``href`` values found in ``soup``.

    Every element returned by ``soup.find_all(href=True)`` contributes its
    ``href`` value. Duplicates are dropped and the remaining links are
    returned in order of first appearance, so the result is reproducible
    between runs (a plain ``set`` would yield an arbitrary order).

    :param soup: parsed document to search.
    :return: list of unique ``href`` values; empty if none are present.
    """
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(
        dict.fromkeys(tag["href"] for tag in soup.find_all(href=True))
    )

def _parse_adblock_rules(
self, website_data: WebsiteData, html: str
) -> list:
def _parse_adblock_rules(self, website_data: WebsiteData) -> list:
values = []
self.adblockparser_options["domain"] = website_data.top_level_domain

for url in website_data.raw_links:
is_blocked = self.match_rules.should_block(url)
if is_blocked:
values.append(url)
values += [
el.group()
for el in self.match_rules.blacklist_re.finditer(url)
]
values += [
rule
for rule in self.match_rules.blacklist_with_options
if rule.match_url(url, self.adblockparser_options)
]

return values

def _work_html_content(self, website_data: WebsiteData) -> list:
values = []

self._logger.info(f"{self.__class__.__name__},{len(self.tag_list)}")
if self.tag_list:
html = "".join(website_data.html)
if self.extraction_method == ExtractionMethod.MATCH_DIRECTLY:
html = "".join(website_data.html)
values = [ele for ele in self.tag_list if html.find(ele) >= 0]
elif self.extraction_method == ExtractionMethod.USE_ADBLOCK_PARSER:
values = self._parse_adblock_rules(
website_data=website_data, html=html
)
values = self._parse_adblock_rules(website_data=website_data)

return values

Expand Down
56 changes: 56 additions & 0 deletions tests/integration/features_integration_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from features.html_based import Advertisement
from features.website_manager import WebsiteData, WebsiteManager
from lib.logger import create_logger

# TODO Check other features, e.g. adult:
# html = {
# "html": "9content.com\n,ytimm.com\n,boyzshop.com/affimages/",
# "har": "",
# "url": "",
# }
# expected = {
# "easylist_adult": {
# "values": ["9content.com", "ad_slot="],
# "runs_within": 10, # time the evaluation may take AT MAX -> acceptance test}
# }
# }


def test_advertisement(mocker):
    """
    Integration test for the ``Advertisement`` feature.

    Sets the feature up, loads a small raw-data payload into the
    ``WebsiteManager`` instance, runs ``advertisement.start()`` and checks
    both the extracted values and the reported ``time_required``.
    """
    _logger = create_logger()

    advertisement = Advertisement(_logger)

    advertisement.setup()
    website_manager = WebsiteManager.get_instance()

    html = {
        "html": "ad_block, ad_slot= mallorcash.com admanmedia murkymouse.online",
        "har": "",
        "url": "",
        "headers": "{}",
    }
    expected = {
        "advertisement": {
            "values": [],  # ["ad_block", "ad_slot="]
            "runs_within": 10,  # time the evaluation may take AT MAX -> acceptance test
        },
    }

    try:
        website_manager.load_raw_data(html)
        # NOTE(review): this WebsiteData object is never passed to the feature;
        # start() takes no arguments. Kept as in the original -- confirm
        # whether it is still needed.
        website_data = WebsiteData(html=html["html"], raw_header="", headers={})
        website_data.raw_links = [html["html"]]
        website_data.html = html["html"]

        data = advertisement.start()
    finally:
        # Always reset the manager, even when start() raises, so state loaded
        # here does not leak into other tests using the same instance.
        website_manager.reset()

    assert (
        data["advertisement"]["values"] == expected["advertisement"]["values"]
    )
    runs_fast_enough = (
        data["advertisement"]["time_required"]
        <= expected["advertisement"]["runs_within"]
    )
    assert runs_fast_enough
2 changes: 0 additions & 2 deletions tests/unit/metadatabase_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import asyncio
from unittest import mock

import adblockparser
Expand Down Expand Up @@ -49,7 +48,6 @@ def test_start(metadatabase: MetadataBase, mocker):
assert values_has_only_one_key
assert values[metadatabase.key]["values"] == []

# TODO: An if in a test -> is this a bad idea?
if "tag_list_last_modified" in values[metadatabase.key].keys():
assert values[metadatabase.key]["tag_list_last_modified"] == ""
assert values[metadatabase.key]["tag_list_expires"] == 0
Expand Down

0 comments on commit 6b1aaf6

Please sign in to comment.