Remove nb_hits update in config file (#48)
* Remove custom context factory

Update scrapy to latest version

* Fix Changelog after update

* Remove nb_hits update in config file

* Remove commented out import

* Remove update_nb_hits-related logic since it is no longer used

* Fix pylint complaint
renehernandez authored Jul 10, 2020
1 parent 089afb4 commit 025f8ed
Showing 3 changed files with 1 addition and 64 deletions.
17 changes: 1 addition & 16 deletions scraper/src/config/config_loader.py
@@ -6,14 +6,12 @@
"""

from collections import OrderedDict
from distutils.util import strtobool
import json
import os
import sys
import copy

from .config_validator import ConfigValidator
from .nb_hits_updater import NbHitsUpdater
from .urls_parser import UrlsParser
from .selectors_parser import SelectorsParser
from .browser_handler import BrowserHandler
@@ -46,7 +44,6 @@ class ConfigLoader:
strategy = 'default'
strict_redirect = True
strip_chars = u".,;:§¶"
update_nb_hits = None
use_anchors = False
user_agent = 'MeiliSearch docs-scraper'
only_content_level = False
@@ -111,9 +108,7 @@ def _parse(self):
# Parse Env
self.app_id = os.environ.get('MEILISEARCH_HOST_URL', None)
self.api_key = os.environ.get('MEILISEARCH_API_KEY', None)
self.update_nb_hits = os.environ.get('UPDATE_NB_HITS', None)
if self.update_nb_hits is not None:
self.update_nb_hits = bool(strtobool(self.update_nb_hits))

if self.index_uid_tmp is None:
self.index_uid_tmp = os.environ.get('index_uid_TMP', self.index_uid + '_tmp')

@@ -128,15 +123,5 @@ def _parse(self):
self.allowed_domains = UrlsParser.build_allowed_domains(
self.start_urls, self.stop_urls)

def update_nb_hits_value(self, nb_hits):
if self.config_file is not None:
# config loaded from file
previous_nb_hits = None if 'nb_hits' not in self.config_content else \
self.config_content['nb_hits']
nb_hit_updater = NbHitsUpdater(self.config_file,
self.config_content,
previous_nb_hits, nb_hits)
nb_hit_updater.update(self.update_nb_hits)

def get_extra_facets(self):
return UrlsParser.get_extra_facets(self.start_urls)
47 changes: 0 additions & 47 deletions scraper/src/config/nb_hits_updater.py

This file was deleted.

1 change: 0 additions & 1 deletion scraper/src/index.py
@@ -103,7 +103,6 @@ def run_config(config):
if DocumentationSpider.NB_INDEXED > 0:
# meilisearch_helper.commit_tmp_index()
print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
else:
print('Crawling issue: nbHits 0 for ' + config.index_uid)
# meilisearch_helper.report_crawling_issue()
