diff --git a/.gitignore b/.gitignore index 1b803a5..a68de0c 100644 --- a/.gitignore +++ b/.gitignore @@ -210,3 +210,4 @@ pyrightconfig.json # ignore all vscode, this is not standard configuration in this place .vscode +output \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 343a43e..428a031 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,6 @@ LABEL org.opencontainers.image.source https://github.com/openzim/ifixit # TODO: do we really need all these packages? RUN apt-get update \ && apt-get install -y --no-install-recommends \ - # locales required if tool has any i18n support locales \ locales-all \ libmagic1 \ diff --git a/README.md b/README.md index 9a2eab6..029f93d 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Docker container as explained below. First, build the Docker image (to be ran in the main folder of this repo): ``` -docker build -t ghcr.io/openzim/ifixit:local . +docker build -t local-ifixit . ``` Then run the scraper with CLI arguments needed for your test (everything after `ifixit2zim` in the example below). @@ -85,7 +85,7 @@ Then run the scraper with CLI arguments needed for your test (everything after ` For instance, if you want to run a scrape of only the `Apple_PDA` category, including its guides, in French : ``` -docker run -it -v $(pwd)/output:/output --rm ghcr.io/openzim/fixit:local ifixit2zim --language fr --output /output --tmp-dir /tmp --category Apple_PDA +docker run -it -v $(pwd)/output:/output --rm local-ifixit ifixit2zim --language fr --output /output --tmp-dir /tmp --category Apple_PDA ``` This will produce a ZIM in the output folder of your current directory. diff --git a/src/ifixit2zim/constants.py b/src/ifixit2zim/constants.py index db52703..52cfa5d 100644 --- a/src/ifixit2zim/constants.py +++ b/src/ifixit2zim/constants.py @@ -144,6 +144,24 @@ # https://www.ifixit.com/Guide/MacBook+Air+11-Inch+Late+2010+Battery+Replacement/4384 # https://www.ifixit.com/Teardown/Apple+Watch+Teardown/40655 +TITLE = { + "en": { + "title_en": "iFixit in English", + "title_fr": "iFixit in French", + "title_pt": "iFixit in Portuguese", + "title_de": "iFixit in German", + "title_ko": "iFixit in Korean", + "title_zh": "iFixit in Chinese", + "title_ru": "iFixit in Russian", + "title_nl": "iFixit in Dutch", + "title_ja": "iFixit in Japanese", + "title_tr": "iFixit in Turkish", + "title_es": "iFixit in Spanish", + "title_it": "iFixit in Italian", + }, + "fr": {"title_fr": "iFixit en Français"}, +} + HOME_LABELS = { "en": {"top_title": "Repair guides for every thing, written by everyone."}, "fr": {"top_title": "Tutoriels de réparation pour tout, écrits par tous."}, @@ -800,7 +818,6 @@ UNAVAILABLE_OFFLINE_INFOS = ["toolkits"] -@dataclass class Configuration: fpath: pathlib.Path @@ -815,14 +832,14 @@ class Configuration: tag: list[str] # filesystem - _output_dir: str # TODO: rename output_name - _tmp_dir: str # IDEM - output_dir: pathlib.Path # TODO: rename output_path - tmp_dir: pathlib.Path # IDEM + _output_name: str + _tmp_name: str + output_path: pathlib.Path + tmp_path: pathlib.Path required = ( "lang_code", - "output_dir", + "output_path", ) lang_code: str @@ -859,35 +876,21 @@ class Configuration: stats_filename: str | None skip_checks: bool - @staticmethod - def get_url(lang_code: str) -> urllib.parse.ParseResult: - return urllib.parse.urlparse(URLS[lang_code]) - - @property - def domain(self) -> str: - return self.main_url.netloc - - @property - def api_url(self) -> str: - return self.main_url.geturl() + API_PREFIX - - @property - def 
s3_url(self) -> str | None: - return self.s3_url_with_credentials - - def __post_init__(self): + def __init__(self, **kwargs): + for key, value in kwargs.items(): + setattr(self, key, value) self.main_url = Configuration.get_url(self.lang_code) self.language = get_language_details(self.lang_code) - self.output_dir = pathlib.Path(self._output_dir).expanduser().resolve() - self.output_dir.mkdir(parents=True, exist_ok=True) + self.output_path = pathlib.Path(self._output_name).expanduser().resolve() + self.output_path.mkdir(parents=True, exist_ok=True) - self.tmp_dir = pathlib.Path(self._tmp_dir).expanduser().resolve() - self.tmp_dir.mkdir(parents=True, exist_ok=True) + self.tmp_path = pathlib.Path(self._tmp_name).expanduser().resolve() + self.tmp_path.mkdir(parents=True, exist_ok=True) if self.build_dir_is_tmp_dir: - self.build_dir = self.tmp_dir + self.build_path = self.tmp_path else: - self.build_dir = pathlib.Path( - tempfile.mkdtemp(prefix=f"ifixit_{self.lang_code}_", dir=self.tmp_dir) + self.build_path = pathlib.Path( + tempfile.mkdtemp(prefix=f"ifixit_{self.lang_code}_", dir=self.tmp_path) ) self.stats_path = None @@ -901,3 +904,19 @@ def __post_init__(self): if ";" in tag: self.tag += [p.strip() for p in tag.split(";")] self.tag.remove(tag) + + @staticmethod + def get_url(lang_code: str) -> urllib.parse.ParseResult: + return urllib.parse.urlparse(URLS[lang_code]) + + @property + def domain(self) -> str: + return self.main_url.netloc + + @property + def api_url(self) -> str: + return self.main_url.geturl() + API_PREFIX + + @property + def s3_url(self) -> str | None: + return self.s3_url_with_credentials diff --git a/src/ifixit2zim/context.py b/src/ifixit2zim/context.py new file mode 100644 index 0000000..01bff73 --- /dev/null +++ b/src/ifixit2zim/context.py @@ -0,0 +1,21 @@ +import threading +from dataclasses import dataclass +from typing import Any + +from jinja2 import Environment +from zimscraperlib.zim.creator import Creator + +from ifixit2zim.constants import Configuration +from ifixit2zim.processor import Processor +from ifixit2zim.utils import Utils + + +@dataclass +class Context: + lock: threading.Lock + configuration: Configuration + creator: Creator + utils: Utils + metadata: dict[str, Any] + env: Environment + processor: Processor diff --git a/src/ifixit2zim/entrypoint.py b/src/ifixit2zim/entrypoint.py index 60b617a..245281c 100755 --- a/src/ifixit2zim/entrypoint.py +++ b/src/ifixit2zim/entrypoint.py @@ -26,7 +26,7 @@ def main(): "--output", help="Output folder for ZIM file", default="/output", - dest="_output_dir", + dest="_output_name", ) parser.add_argument( @@ -103,7 +103,7 @@ def main(): "--tmp-dir", help="Path to create temp folder in. Used for building ZIM file.", default=os.getenv("TMPDIR", "."), - dest="_tmp_dir", + dest="_tmp_name", ) parser.add_argument( @@ -274,7 +274,7 @@ def main(): scraper = IFixit2Zim(**dict(args._get_kwargs())) sys.exit(scraper.run()) except Exception as exc: - logger.error(f"FAILED. An error occurred: {exc}") + logger.error("FAILED. 
An error occurred", exc_info=exc) if args.debug: logger.exception(exc) raise SystemExit(1) from None diff --git a/src/ifixit2zim/imager.py b/src/ifixit2zim/imager.py index ca2803c..0dcf4ca 100644 --- a/src/ifixit2zim/imager.py +++ b/src/ifixit2zim/imager.py @@ -13,19 +13,34 @@ from zimscraperlib.image.optimization import optimize_webp from ifixit2zim.constants import IMAGES_ENCODER_VERSION -from ifixit2zim.scraper import IFixit2Zim from ifixit2zim.shared import logger +from ifixit2zim.utils import Utils +import threading +from ifixit2zim.executor import Executor +from ifixit2zim.scraper import Configuration +from zimscraperlib.zim.creator import Creator class Imager: - def __init__(self, scraper: IFixit2Zim): + def __init__( + self, + img_executor: Executor, + lock: threading.Lock, + creator: Creator, + utils: Utils, + configuration: Configuration, + ): self.aborted = False # list of source URLs that we've processed and added to ZIM self.handled = set() self.dedup_items = {} - self.scraper = scraper + self.img_executor = img_executor + self.lock = lock + self.creator = creator + self.utils = utils + self.configuration = configuration - self.scraper.img_executor.start() + self.img_executor.start() def abort(self): """request imager to cancel processing of futures""" @@ -70,7 +85,7 @@ def defer(self, url: str) -> str | None: # find actual URL should it be from a provider try: - parsed_url = urllib.parse.urlparse(self.scraper.utils.to_url(url)) + parsed_url = urllib.parse.urlparse(self.utils.to_url(url)) except Exception: logger.warning(f"Can't parse image URL `{url}`. Skipping") return @@ -89,7 +104,7 @@ def defer(self, url: str) -> str | None: # record that we are processing this one self.handled.add(path) - self.scraper.img_executor.submit( + self.img_executor.submit( self.process_image, url=parsed_url, path=path, @@ -108,22 +123,22 @@ def check_for_duplicate(self, path, content): def add_image_to_zim(self, path, content, mimetype): duplicate_path = self.check_for_duplicate(path, content) - with self.scraper.lock: + with self.lock: if duplicate_path: - self.scraper.creator.add_redirect( + self.creator.add_redirect( path=path, target_path=duplicate_path, ) else: - self.scraper.creator.add_item_for( + self.creator.add_item_for( path=path, content=content, mimetype=mimetype, ) def add_missing_image_to_zim(self, path): - with self.scraper.lock: - self.scraper.creator.add_redirect( + with self.lock: + self.creator.add_redirect( path=path, target_path="assets/NoImage_300x225.jpg", ) @@ -137,7 +152,7 @@ def process_image( return # just download, optimize and add to ZIM if not using S3 - if not self.scraper.configuration.s3_url: + if not self.configuration.s3_url: try: fileobj = self.get_image_data(url.geturl()) except Exception as exc: @@ -159,7 +174,7 @@ def process_image( return path # we are using S3 cache - ident = self.scraper.utils.get_version_ident_for(url.geturl()) + ident = self.utils.get_version_ident_for(url.geturl()) if ident is None: logger.error(f"Unable to query {url.geturl()}. 
Skipping") self.add_missing_image_to_zim( @@ -168,7 +183,7 @@ def process_image( return path # key = self.get_s3_key_for(url.geturl()) - s3_storage = KiwixStorage(self.scraper.configuration.s3_url) + s3_storage = KiwixStorage(self.configuration.s3_url) meta = {"ident": ident, "encoder_version": str(IMAGES_ENCODER_VERSION)} download_failed = False # useful to trigger reupload or not diff --git a/src/ifixit2zim/processor.py b/src/ifixit2zim/processor.py index afc8193..3991f7a 100644 --- a/src/ifixit2zim/processor.py +++ b/src/ifixit2zim/processor.py @@ -13,18 +13,62 @@ UNAVAILABLE_OFFLINE, ) from ifixit2zim.exceptions import ImageUrlNotFoundError -from ifixit2zim.scraper import IFixit2Zim from ifixit2zim.shared import logger, setlocale +import threading +from ifixit2zim.scraper import Configuration +from ifixit2zim.imager import Imager +from zimscraperlib.zim.creator import Creator class Processor: - def __init__(self, scraper: IFixit2Zim) -> None: - self.scraper = scraper + def __init__( + self, + lock: threading.Lock, + configuration: Configuration, + creator: Creator, + imager: Imager, + ) -> None: self.null_categories = set() self.ifixit_external_content = set() self.final_hrefs = {} + self.lock = lock + self.configuration = configuration + self.creator = creator + self.imager = imager - def guides_in_progress(self, guides, *, in_progress=True): + @property + def get_guide_link_from_props(self): + return self._get_guide_link_from_props + + @get_guide_link_from_props.setter + def get_guide_link_from_props(self, get_guide_link_from_props): + self._get_guide_link_from_props = get_guide_link_from_props + + @property + def get_category_link_from_props(self): + return self._get_category_link_from_props + + @get_category_link_from_props.setter + def get_category_link_from_props(self, get_category_link_from_props): + self._get_category_link_from_props = get_category_link_from_props + + @property + def get_info_link_from_props(self): + return self._get_info_link_from_props + + @get_info_link_from_props.setter + def get_info_link_from_props(self, get_info_link_from_props): + self._get_info_link_from_props = get_info_link_from_props + + @property + def get_user_link_from_props(self): + return self._get_user_link_from_props + + @get_user_link_from_props.setter + def get_user_link_from_props(self, get_user_link_from_props): + self._get_user_link_from_props = get_user_link_from_props + + def guides_in_progress(self, guides, in_progress=True): if in_progress: return [guide for guide in guides if "GUIDE_IN_PROGRESS" in guide["flags"]] return [guide for guide in guides if "GUIDE_IN_PROGRESS" not in guide["flags"]] @@ -42,7 +86,7 @@ def category_count_tools(self, category): return len(category["tools"]) def get_image_path(self, image_url): - return self.scraper.imager.defer(url=image_url) + return self.imager.defer(url=image_url) def _get_image_url_search( self, obj, *, for_guide: bool, for_device: bool, for_wiki: bool, for_user: bool @@ -154,7 +198,7 @@ def _process_href_regex_anchor(self, match): def _process_href_regex_guide(self, rel_prefix, match): if not match.group("guide"): return None - link = self.scraper.get_guide_link_from_props( + link = self.get_guide_link_from_props( guideid=match.group("guideid"), guidetitle=urllib.parse.unquote_plus(match.group("guidetitle")), ) @@ -163,7 +207,7 @@ def _process_href_regex_guide(self, rel_prefix, match): def _process_href_regex_device(self, rel_prefix, match): if not match.group("device"): return None - link = self.scraper.get_category_link_from_props( + 
link = self.get_category_link_from_props( category_title=urllib.parse.unquote_plus(match.group("devicetitle")) ) return f"{rel_prefix}{link}{match.group('deviceafter') or ''}" @@ -171,7 +215,7 @@ def _process_href_regex_device(self, rel_prefix, match): def _process_href_regex_info(self, rel_prefix, match): if not match.group("info"): return None - link = self.scraper.get_info_link_from_props( + link = self.get_info_link_from_props( info_title=urllib.parse.unquote_plus(match.group("infotitle")) ) return f"{rel_prefix}{link}{match.group('infoafter') or ''}" @@ -179,7 +223,7 @@ def _process_href_regex_info(self, rel_prefix, match): def _process_href_regex_user(self, rel_prefix, match): if not match.group("user"): return None - link = self.scraper.get_user_link_from_props( + link = self.get_user_link_from_props( userid=match.group("userid"), usertitle=urllib.parse.unquote_plus(match.group("usertitle")), ) @@ -229,7 +273,7 @@ def normalize_href(self, href): def _process_href_regex(self, href, rel_prefix): if href.startswith("/"): - href = self.scraper.configuration.main_url.geturl() + href + href = self.configuration.main_url.geturl() + href if href.startswith("http") and "ifixit.com/" in href: href = self.normalize_href(href) href = urllib.parse.quote(href) @@ -303,7 +347,7 @@ def _process_gbl_regex(self, match, rel_prefix): raise Exception("Unsupported match in cleanup_rendered_content") def cleanup_rendered_content(self, content, rel_prefix="../"): - if self.scraper.configuration.no_cleanup: + if self.configuration.no_cleanup: return content return re.sub( self.gbl_regex, @@ -315,9 +359,9 @@ def convert_title_to_filename(self, title): return re.sub(r"\s", "_", title) def add_html_item(self, path, title, content): - with self.scraper.lock: + with self.lock: logger.debug(f"Adding item in ZIM at path '{path}'") - self.scraper.creator.add_item_for( + self.creator.add_item_for( path=path, title=title, content=content, @@ -326,9 +370,9 @@ def add_html_item(self, path, title, content): ) def add_redirect(self, path, target_path): - with self.scraper.lock: + with self.lock: logger.debug(f"Adding redirect in ZIM from '{path}' to '{target_path}'") - self.scraper.creator.add_redirect( + self.creator.add_redirect( path=path, target_path=target_path, ) diff --git a/src/ifixit2zim/scraper.py b/src/ifixit2zim/scraper.py index be62432..027750e 100644 --- a/src/ifixit2zim/scraper.py +++ b/src/ifixit2zim/scraper.py @@ -10,14 +10,19 @@ from zimscraperlib.inputs import compute_descriptions from zimscraperlib.zim.creator import Creator +import io from ifixit2zim.constants import ( DEFAULT_HOMEPAGE, ROOT_DIR, + TITLE, Configuration, ) +from ifixit2zim.exceptions import CategoryHomePageContentError + from ifixit2zim.executor import Executor from ifixit2zim.imager import Imager from ifixit2zim.processor import Processor +from ifixit2zim.context import Context from ifixit2zim.scraper_category import ScraperCategory from ifixit2zim.scraper_guide import ScraperGuide from ifixit2zim.scraper_homepage import ScraperHomepage @@ -36,31 +41,19 @@ def __init__(self, **kwargs): if getattr(self.configuration, option) is None: raise ValueError(f"Missing parameter `{option}`") - self.scraper_homepage = ScraperHomepage(scraper=self) - self.scraper_guide = ScraperGuide(scraper=self) - self.scraper_category = ScraperCategory(scraper=self) - self.scraper_info = ScraperInfo(scraper=self) - self.scraper_user = ScraperUser(scraper=self) - self.scrapers = [ - self.scraper_homepage, - self.scraper_category, - self.scraper_guide, - 
self.scraper_info, - self.scraper_user, - ] self.lock = threading.Lock() - self.processor = Processor(scraper=self) + self.utils = Utils(configuration=self.configuration) @property - def build_dir(self): - return self.configuration.build_dir + def build_path(self): + return self.configuration.build_path def cleanup(self): """Remove temp files and release resources before exiting""" if not self.configuration.keep_build_dir: - logger.debug(f"Removing {self.build_dir}") - shutil.rmtree(self.build_dir, ignore_errors=True) + logger.debug(f"Removing {self.build_path}") + shutil.rmtree(self.build_path, ignore_errors=True) def sanitize_inputs(self): """input & metadata sanitation""" @@ -95,8 +88,22 @@ def sanitize_inputs(self): f"{self.configuration.name}_{period}.zim" ) - # TODO: fixed title based on defined convention (30 chars only) if not self.configuration.title: + # Try to grab title in selected language, otherwise use title in English. + # Logic is a bit complex because we need the title for the selected + # language in the selected language, or fall back to the title for the + # selected language in English. + if ( + self.configuration.lang_code in TITLE + and f"title_{self.configuration.lang_code}" + in TITLE[self.configuration.lang_code] + ): + self.configuration.title = TITLE[self.configuration.lang_code][ + f"title_{self.configuration.lang_code}" + ] + else: + self.configuration.title = TITLE["en"][ + f"title_{self.configuration.lang_code}" + ] - self.configuration.title = self.metadata["title"] self.configuration.title = self.configuration.title.strip() @@ -146,21 +154,6 @@ def add_assets(self): with self.lock: self.creator.add_item_for(path=path, fpath=fpath) - def add_illustrations(self): - logger.info("Adding illustrations") - - src_illus_fpath = pathlib.Path(ROOT_DIR.joinpath("assets", "illustration.png")) - tmp_illus_fpath = pathlib.Path(self.build_dir, "illustration.png") - - shutil.copy(src_illus_fpath, tmp_illus_fpath) - - # resize to appropriate size (ZIM uses 48x48 so we double for retina) - for size in (96, 48): - resize_image(tmp_illus_fpath, width=size, height=size, method="thumbnail") - with open(tmp_illus_fpath, "rb") as fh: - with self.lock: - self.creator.add_illustration(size, fh.read()) - def setup(self): # order matters are there are references between them self.img_executor = Executor( queue_size=100, nb_workers=50, prefix="IMG-T-", ) - self.imager = Imager(scraper=self) + src_illus_fpath = pathlib.Path(ROOT_DIR.joinpath("assets", "illustration.png")) + dst = io.BytesIO() + resize_image( + src=src_illus_fpath, + dst=dst, + width=48, + height=48, + method="thumbnail", + ) self.creator = Creator( - filename=self.configuration.output_dir / self.configuration.fpath, + filename=self.configuration.output_path / self.configuration.fpath, main_path=DEFAULT_HOMEPAGE, workaround_nocancel=False, ).config_metadata( - Illustration_48x48_at_1=b"illustration", + Illustration_48x48_at_1=dst.getvalue(), Language=self.configuration.language["iso-639-3"], Title=self.configuration.title, Description=self.configuration.description, @@ -193,6 +194,14 @@ Date=datetime.datetime.now(tz=datetime.UTC).date(), ) + self.imager = Imager( + lock=self.lock, + creator=self.creator, + img_executor=self.img_executor, + utils=self.utils, + configuration=self.configuration, + ) # jinja2 environment setup self.env = Environment( loader=FileSystemLoader(ROOT_DIR.joinpath("templates")), @@ -202,8 +211,73 @@ def _raise_helper(msg): raise Exception(msg) - self.env.globals["raise"] = 
_raise_helper - self.env.globals["str"] = lambda x: str(x) + self.processor = Processor( + lock=self.lock, + configuration=self.configuration, + creator=self.creator, + imager=self.imager, + ) + + context = Context( + lock=self.lock, + configuration=self.configuration, + creator=self.creator, + utils=self.utils, + metadata=self.metadata, + env=self.env, + processor=self.processor, + ) + + self.scraper_homepage = ScraperHomepage(context=context) + self.scraper_guide = ScraperGuide(context=context) + self.scraper_category = ScraperCategory(context=context) + self.scraper_info = ScraperInfo(context=context) + self.scraper_user = ScraperUser(context=context) + self.scrapers = [ + self.scraper_homepage, + self.scraper_category, + self.scraper_guide, + self.scraper_info, + self.scraper_user, + ] + + self.processor.get_guide_link_from_props = ( + self.scraper_guide.get_guide_link_from_props + ) + self.processor.get_category_link_from_props = ( + self.scraper_category.get_category_link_from_props + ) + self.processor.get_info_link_from_props = ( + self.scraper_info.get_info_link_from_props + ) + self.processor.get_user_link_from_props = ( + self.scraper_user.get_user_link_from_props + ) + + self.env.filters["get_category_link_from_obj"] = ( + self.scraper_category.get_category_link_from_obj + ) + self.env.filters["get_category_link_from_props"] = ( + self.scraper_category.get_category_link_from_props + ) + self.env.filters["get_guide_link_from_obj"] = ( + self.scraper_guide.get_guide_link_from_obj + ) + self.env.filters["get_guide_link_from_props"] = ( + self.scraper_guide.get_guide_link_from_props + ) + self.env.filters["get_info_link_from_obj"] = ( + self.scraper_info.get_info_link_from_obj + ) + self.env.filters["get_info_link_from_props"] = ( + self.scraper_info.get_info_link_from_props + ) + self.env.filters["get_user_link_from_obj"] = ( + self.scraper_user.get_user_link_from_obj + ) + self.env.filters["get_user_link_from_props"] = ( + self.scraper_user.get_user_link_from_props + ) self.env.filters["guides_in_progress"] = self.processor.guides_in_progress self.env.filters["category_count_parts"] = self.processor.category_count_parts self.env.filters["category_count_tools"] = self.processor.category_count_tools @@ -222,6 +296,11 @@ def _raise_helper(msg): self.processor.get_guide_total_comments_count ) self.env.filters["get_user_display_name"] = self.processor.get_user_display_name + self.env.globals["raise"] = _raise_helper + self.env.globals["str"] = lambda x: str(x) + + for scraper in self.scrapers: + scraper.setup() def run(self): # first report => creates a file with appropriate structure @@ -247,12 +326,12 @@ def run(self): f"Starting scraper with:\n" f" language: {self.configuration.language['english']}" f" ({self.configuration.domain})\n" - f" output_dir: {self.configuration.output_dir}\n" - f" build_dir: {self.build_dir}\n" + f" output: {self.configuration.output_path}\n" + f" build: {self.build_path}\n" f"{s3_msg}" ) - self.metadata = self.scraper_homepage.get_online_metadata() + self.metadata = self.get_online_metadata() logger.debug( f"Additional metadata scrapped online:\n" f"title: {self.metadata['title']}\n" @@ -263,43 +342,10 @@ def run(self): logger.debug("Starting Zim creation") self.setup() - self.env.filters["get_category_link_from_obj"] = ( - self.scraper_category.get_category_link_from_obj - ) - self.env.filters["get_category_link_from_props"] = ( - self.scraper_category.get_category_link_from_props - ) - self.env.filters["get_guide_link_from_obj"] = ( - 
self.scraper_guide.get_guide_link_from_obj - ) - self.env.filters["get_guide_link_from_props"] = ( - self.scraper_guide.get_guide_link_from_props - ) - self.env.filters["get_info_link_from_obj"] = ( - self.scraper_info.get_info_link_from_obj - ) - self.env.filters["get_info_link_from_props"] = ( - self.scraper_info.get_info_link_from_props - ) - self.env.filters["get_user_link_from_obj"] = ( - self.scraper_user.get_user_link_from_obj - ) - self.env.filters["get_user_link_from_props"] = ( - self.scraper_user.get_user_link_from_props - ) - self.get_category_link_from_props = ( - self.scraper_category.get_category_link_from_props - ) - self.get_guide_link_from_props = self.scraper_guide.get_guide_link_from_props - self.get_info_link_from_props = self.scraper_info.get_info_link_from_props - self.get_user_link_from_props = self.scraper_user.get_user_link_from_props - for scraper in self.scrapers: - scraper.setup() self.creator.start() try: self.add_assets() - self.add_illustrations() for scraper in self.scrapers: scraper.build_expected_items() @@ -398,3 +444,58 @@ def report_progress(self): } with open(self.configuration.stats_path, "w") as outfile: json.dump(progress, outfile, indent=2) + + def get_online_metadata(self): + """metadata from online website, looking at homepage source code""" + logger.info("Fetching website metadata") + + soup, _ = self.utils.get_soup("/") + + return { + "title": soup.find( + "title" + ).string, # pyright: ignore[reportAttributeAccessIssue, reportOptionalMemberAccess] + "description": soup.find( + "meta", attrs={"name": "description"} + ).attrs.get( # pyright: ignore[reportAttributeAccessIssue, reportOptionalMemberAccess] + "content" + ), + "stats": self._extract_stats_from_page(soup), + "current_year": datetime.datetime.now(tz=datetime.UTC).year, + } + + def _extract_stats_from_page(self, soup): + results = soup.findAll("div", {"data-name": "KPIDisplay"}) + if len(results) == 0: + raise CategoryHomePageContentError("No KPIs found") + if len(results) > 1: + raise CategoryHomePageContentError("Too many KPIs found") + kpi = results[0].get("data-props") + if kpi is None: + raise CategoryHomePageContentError("KPIs not found in data-props") + + try: + kpi_d = json.loads(kpi) + except json.decoder.JSONDecodeError as e: + raise CategoryHomePageContentError( + f"Failed to decode stats from '{kpi}' to integer" + ) from e + + if "stats" not in kpi_d: + raise CategoryHomePageContentError(f"Stats not found in KPIs '{kpi}'") + + stats = kpi_d["stats"] + + if len(stats) == 0: + raise CategoryHomePageContentError("Stats array is empty") + for stat in stats: + if "value" not in stat: + raise CategoryHomePageContentError( + f"No value found in stat '{json.dumps(stat)}'" + ) + if "label" not in stat: + raise CategoryHomePageContentError( + f"No label found in stat '{json.dumps(stat)}'" + ) + + return stats diff --git a/src/ifixit2zim/scraper_category.py b/src/ifixit2zim/scraper_category.py index 8b0e59f..db4512a 100644 --- a/src/ifixit2zim/scraper_category.py +++ b/src/ifixit2zim/scraper_category.py @@ -2,14 +2,14 @@ from ifixit2zim.constants import CATEGORY_LABELS, URLS from ifixit2zim.exceptions import UnexpectedDataKindExceptionError -from ifixit2zim.scraper import IFixit2Zim +from ifixit2zim.context import Context from ifixit2zim.scraper_generic import ScraperGeneric from ifixit2zim.shared import logger class ScraperCategory(ScraperGeneric): - def __init__(self, scraper: IFixit2Zim): - super().__init__(scraper) + def __init__(self, context: Context): + 
super().__init__(context) def setup(self): self.category_template = self.env.get_template("category.html") @@ -79,16 +79,14 @@ def build_expected_items(self): self._add_category_to_scrape(category_key, category, True) return logger.info("Downloading list of categories") - categories = self.scraper.utils.get_api_content( - "/categories", includeStubs=True - ) + categories = self.utils.get_api_content("/categories", includeStubs=True) self._process_categories(categories) logger.info(f"{len(self.expected_items_keys)} categories found") def get_one_item_content(self, item_key, item_data): # noqa ARG002 categoryid = item_key - category_content = self.scraper.utils.get_api_content( + category_content = self.utils.get_api_content( f"/wikis/CATEGORY/{categoryid}", langid=self.configuration.lang_code ) @@ -96,7 +94,7 @@ def get_one_item_content(self, item_key, item_data): # noqa ARG002 return category_content logger.warning("Falling back to category in EN") - category_content = self.scraper.utils.get_api_content( + category_content = self.utils.get_api_content( f"/wikis/CATEGORY/{categoryid}", langid="en" ) @@ -105,7 +103,7 @@ def get_one_item_content(self, item_key, item_data): # noqa ARG002 for lang in URLS.keys(): logger.warning(f"Falling back to category in {lang}") - category_content = self.scraper.utils.get_api_content( + category_content = self.utils.get_api_content( f"/wikis/CATEGORY/{categoryid}", langid=lang ) diff --git a/src/ifixit2zim/scraper_generic.py b/src/ifixit2zim/scraper_generic.py index 8f73045..c0fe8ee 100644 --- a/src/ifixit2zim/scraper_generic.py +++ b/src/ifixit2zim/scraper_generic.py @@ -5,15 +5,15 @@ from schedule import run_pending from ifixit2zim.exceptions import FinalScrapingFailureError -from ifixit2zim.scraper import IFixit2Zim +from ifixit2zim.context import Context from ifixit2zim.shared import logger FIRST_ITEMS_COUNT = 5 class ScraperGeneric(ABC): - def __init__(self, scraper: IFixit2Zim): - self.scraper = scraper + def __init__(self, context: Context): + self.context = context self.expected_items_keys = {} self.unexpected_items_keys = {} self.items_queue = Queue() @@ -22,31 +22,31 @@ def __init__(self, scraper: IFixit2Zim): @property def configuration(self): - return self.scraper.configuration + return self.context.configuration @property def utils(self): - return self.scraper.utils + return self.context.utils @property def metadata(self): - return self.scraper.metadata + return self.context.metadata @property def env(self): - return self.scraper.env + return self.context.env @property def lock(self): - return self.scraper.lock + return self.context.lock @property def creator(self): - return self.scraper.creator + return self.context.creator @property def processor(self): - return self.scraper.processor + return self.context.processor @abstractmethod def setup(self): diff --git a/src/ifixit2zim/scraper_guide.py b/src/ifixit2zim/scraper_guide.py index 21b3747..a674fba 100644 --- a/src/ifixit2zim/scraper_guide.py +++ b/src/ifixit2zim/scraper_guide.py @@ -11,14 +11,14 @@ UNKNOWN_TITLE, ) from ifixit2zim.exceptions import UnexpectedDataKindExceptionError -from ifixit2zim.scraper import IFixit2Zim +from ifixit2zim.context import Context from ifixit2zim.scraper_generic import ScraperGeneric from ifixit2zim.shared import logger class ScraperGuide(ScraperGeneric): - def __init__(self, scraper: IFixit2Zim): - super().__init__(scraper) + def __init__(self, context: Context): + super().__init__(context) def setup(self): self.guide_template = 
self.env.get_template("guide.html") @@ -100,9 +100,7 @@ def build_expected_items(self): limit = 200 offset = 0 while True: - guides = self.scraper.utils.get_api_content( - "/guides", limit=limit, offset=offset - ) + guides = self.utils.get_api_content("/guides", limit=limit, offset=offset) if not guides or len(guides) == 0: break for guide in guides: @@ -133,12 +131,10 @@ def get_one_item_content(self, item_key, item_data): if locale == "ja": locale = "jp" # Unusual iFixit convention - guide_content = self.scraper.utils.get_api_content( - f"/guides/{guideid}", langid=locale - ) + guide_content = self.utils.get_api_content(f"/guides/{guideid}", langid=locale) if guide_content is None and locale != "en": # guide is most probably available in English anyway - guide_content = self.scraper.utils.get_api_content( + guide_content = self.utils.get_api_content( f"/guides/{guideid}", langid="en" ) diff --git a/src/ifixit2zim/scraper_homepage.py b/src/ifixit2zim/scraper_homepage.py index 7a38afd..fd1ec88 100644 --- a/src/ifixit2zim/scraper_homepage.py +++ b/src/ifixit2zim/scraper_homepage.py @@ -4,14 +4,14 @@ from ifixit2zim.constants import DEFAULT_HOMEPAGE, HOME_LABELS from ifixit2zim.exceptions import CategoryHomePageContentError -from ifixit2zim.scraper import IFixit2Zim +from ifixit2zim.context import Context from ifixit2zim.scraper_generic import ScraperGeneric from ifixit2zim.shared import logger class ScraperHomepage(ScraperGeneric): - def __init__(self, scraper: IFixit2Zim): - super().__init__(scraper) + def __init__(self, context: Context): + super().__init__(context) def setup(self): self.homepage_template = self.env.get_template("home.html") @@ -24,7 +24,7 @@ def build_expected_items(self): self.add_item_to_scrape(1, 1, True) def get_one_item_content(self, item_key, item_data): # noqa ARG002 - soup, _ = self.scraper.utils.get_soup("/Guide") + soup, _ = self.utils.get_soup("/Guide") return soup def add_item_redirect(self, item_key, item_data, redirect_kind): # noqa ARG002 @@ -422,42 +422,6 @@ def _extract_title_from_sub_category(self, sc): ) return title - def _extract_stats_from_page(self, soup): - results = soup.findAll("div", {"data-name": "KPIDisplay"}) - if len(results) == 0: - raise CategoryHomePageContentError("No KPIs found") - if len(results) > 1: - raise CategoryHomePageContentError("Too many KPIs found") - kpi = results[0].get("data-props") - if kpi is None: - raise CategoryHomePageContentError("KPIs not found in data-props") - - try: - kpi_d = json.loads(kpi) - except json.decoder.JSONDecodeError as e: - raise CategoryHomePageContentError( - f"Failed to decode stats from '{kpi}' to integer" - ) from e - - if "stats" not in kpi_d: - raise CategoryHomePageContentError(f"Stats not found in KPIs '{kpi}'") - - stats = kpi_d["stats"] - - if len(stats) == 0: - raise CategoryHomePageContentError("Stats array is empty") - for stat in stats: - if "value" not in stat: - raise CategoryHomePageContentError( - f"No value found in stat '{json.dumps(stat)}'" - ) - if "label" not in stat: - raise CategoryHomePageContentError( - f"No label found in stat '{json.dumps(stat)}'" - ) - - return stats - def _extract_details_from_single_stat(self, fs): stat_text_css_selector = "chakra-stat__help-text" p = fs.select(stat_text_css_selector) @@ -507,22 +471,3 @@ def _extract_details_from_single_stat(self, fs): raise CategoryHomePageContentError( f"Failed to convert text '{stat_number}' to integer for stat" ) from None - - def get_online_metadata(self): - """metadata from online website, looking at 
homepage source code""" - logger.info("Fetching website metadata") - - soup, _ = self.scraper.utils.get_soup("/") - - return { - "title": soup.find( - "title" - ).string, # pyright: ignore[reportAttributeAccessIssue, reportOptionalMemberAccess] - "description": soup.find( - "meta", attrs={"name": "description"} - ).attrs.get( # pyright: ignore[reportAttributeAccessIssue, reportOptionalMemberAccess] - "content" - ), - "stats": self._extract_stats_from_page(soup), - "current_year": datetime.datetime.now(tz=datetime.UTC).year, - } diff --git a/src/ifixit2zim/scraper_info.py b/src/ifixit2zim/scraper_info.py index 5254d7f..eed0427 100644 --- a/src/ifixit2zim/scraper_info.py +++ b/src/ifixit2zim/scraper_info.py @@ -2,14 +2,14 @@ from ifixit2zim.constants import UNAVAILABLE_OFFLINE_INFOS from ifixit2zim.exceptions import UnexpectedDataKindExceptionError -from ifixit2zim.scraper import IFixit2Zim +from ifixit2zim.context import Context from ifixit2zim.scraper_generic import ScraperGeneric from ifixit2zim.shared import logger class ScraperInfo(ScraperGeneric): - def __init__(self, scraper: IFixit2Zim): - super().__init__(scraper) + def __init__(self, context: Context): + super().__init__(context) def setup(self): self.info_template = self.env.get_template("info.html") @@ -77,7 +77,7 @@ def build_expected_items(self): limit = 200 offset = 0 while True: - info_wikis = self.scraper.utils.get_api_content( + info_wikis = self.utils.get_api_content( "/wikis/INFO", limit=limit, offset=offset ) if not info_wikis or len(info_wikis) == 0: @@ -97,9 +97,7 @@ def build_expected_items(self): def get_one_item_content(self, item_key, item_data): # noqa ARG002 info_wiki_title = item_key - info_wiki_content = self.scraper.utils.get_api_content( - f"/wikis/INFO/{info_wiki_title}" - ) + info_wiki_content = self.utils.get_api_content(f"/wikis/INFO/{info_wiki_title}") return info_wiki_content def add_item_redirect(self, item_key, item_data, redirect_kind): # noqa ARG002 diff --git a/src/ifixit2zim/scraper_user.py b/src/ifixit2zim/scraper_user.py index faff008..a5c0dce 100644 --- a/src/ifixit2zim/scraper_user.py +++ b/src/ifixit2zim/scraper_user.py @@ -2,14 +2,14 @@ from ifixit2zim.constants import UNKNOWN_TITLE, USER_LABELS from ifixit2zim.exceptions import UnexpectedDataKindExceptionError -from ifixit2zim.scraper import IFixit2Zim +from ifixit2zim.context import Context from ifixit2zim.scraper_generic import ScraperGeneric from ifixit2zim.shared import logger class ScraperUser(ScraperGeneric): - def __init__(self, scraper: IFixit2Zim): - super().__init__(scraper) + def __init__(self, context: Context): + super().__init__(context) self.user_id_to_titles = {} def setup(self): diff --git a/src/ifixit2zim/utils.py b/src/ifixit2zim/utils.py index 6a33bf7..d2e6871 100644 --- a/src/ifixit2zim/utils.py +++ b/src/ifixit2zim/utils.py @@ -15,7 +15,7 @@ from ifixit2zim.shared import logger -def __backoff_hdlr(details): +def backoff_hdlr(details): logger.warning( "Backing off {wait:0.1f} seconds after {tries} tries " "calling function {target} with args {args} and kwargs " @@ -153,7 +153,7 @@ def setup_s3_and_check_credentials(self, s3_url_with_credentials): backoff.expo, requests.exceptions.RequestException, max_time=16, - on_backoff=__backoff_hdlr, + on_backoff=backoff_hdlr, ) def get_api_content(self, path, **params): full_path = self.get_url(API_PREFIX + path, **params)
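Reviewer note on the dependency-injection refactor above: instead of every collaborator holding a back-reference to the whole `IFixit2Zim` scraper, `Imager` and `Processor` now receive exactly the dependencies they use, and the `Scraper*` classes share a single `Context` dataclass built once in `IFixit2Zim.setup()`. Below is a minimal, self-contained sketch of that pattern as it appears in `context.py` and `scraper_generic.py`; `FakeConfiguration` and the values passed to it are hypothetical stand-ins, not part of the codebase:

```python
import threading
from dataclasses import dataclass
from typing import Any


@dataclass
class FakeConfiguration:
    # hypothetical stand-in for ifixit2zim.constants.Configuration
    lang_code: str


@dataclass
class Context:
    # mirrors src/ifixit2zim/context.py: one shared bundle of dependencies
    lock: threading.Lock
    configuration: FakeConfiguration
    metadata: dict[str, Any]


class ScraperGeneric:
    # mirrors scraper_generic.py: store the context once...
    def __init__(self, context: Context):
        self.context = context

    # ...and expose each dependency through a read-only property, so call
    # sites say `self.configuration` instead of `self.scraper.configuration`
    @property
    def configuration(self):
        return self.context.configuration

    @property
    def lock(self):
        return self.context.lock


ctx = Context(
    lock=threading.Lock(),
    configuration=FakeConfiguration(lang_code="en"),
    metadata={},
)
scraper = ScraperGeneric(ctx)
assert scraper.configuration.lang_code == "en"
```

Because the same `Context` instance is handed to every scraper, state that must stay unique (the lock, the Jinja2 environment, the ZIM `Creator`) is created once and shared, while each class only declares what it actually reads.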