diff --git a/Pipfile b/Pipfile index eb13315..a767d1a 100644 --- a/Pipfile +++ b/Pipfile @@ -4,7 +4,7 @@ verify_ssl = true name = "pypi" [packages] -Scrapy = "==2.2.1" +Scrapy = "==2.3.0" selenium = "==3.141.0" pytest = "==6.0.0" meilisearch = "==0.12.3" diff --git a/Pipfile.lock b/Pipfile.lock index 198b085..8050f9b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "a070ede06a8e51e17e2b993a69a413c28e7aaa9e82ea95ee77e455911bbd47c3" + "sha256": "9b5b87fc284e2472e9f61023ae7b582a675f79f78b4034cb2ebc0dd4b98813de" }, "pipfile-spec": 6, "requires": { @@ -157,6 +157,22 @@ ], "version": "==0.1.0" }, + "itemloaders": { + "hashes": [ + "sha256:a7803a1c27177d73329a0cc83a9c10de50fa4d4f37970e0194e8ae24b1fb7066", + "sha256:d8f92a93d0cc9f5a7f72f01562539cb7030f7741758e764a0a8716f9b0210f7a" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.3" + }, + "jmespath": { + "hashes": [ + "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", + "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.10.0" + }, "lxml": { "hashes": [ "sha256:05a444b207901a68a6526948c7cc8f9fe6d6f24c70781488e32fd74ff5996e3f", @@ -326,7 +342,7 @@ "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1", "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b" ], - "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'", + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.4.7" }, "pytest": { @@ -362,11 +378,11 @@ }, "scrapy": { "hashes": [ - "sha256:6a09beb5190bfdee2d72cf261822eae5d92fe8a86ac9ee1f55fc44b4864ca583", - "sha256:d9d898739f199bd9f9e2258770d5bfeeb754b6ed4eb84a41c04fd52e9649266d" + "sha256:31a9807f8771bfa33693aae9a5da46b4cfc8362dee11dff811574b760cc2749d", + "sha256:b4d08cdacb615563c291d053ef1ba2dc08d9d4b6d81578684eaa1cf7b832f90c" ], "index": "pypi", - "version": "==2.2.1" + "version": "==2.3.0" }, "selenium": { "hashes": [ @@ -388,7 +404,7 @@ "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.15.0" }, "toml": { @@ -553,7 +569,7 @@ "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.15.0" }, "toml": { diff --git a/scraper/src/documentation_spider.py b/scraper/src/documentation_spider.py index 651cd04..58c3eb8 100644 --- a/scraper/src/documentation_spider.py +++ b/scraper/src/documentation_spider.py @@ -145,6 +145,9 @@ def start_requests(self): }, errback=self.errback_alternative_link) + def parse(self, response, **kwargs): + return super()._parse(response, **kwargs) + def add_records(self, response, from_sitemap): records = self.strategy.get_records_from_response(response) self.meilisearch_helper.add_records(records, response.url, from_sitemap) @@ -176,7 +179,6 @@ def parse_from_start_url(self, response): if self.is_rules_compliant(response): self.add_records(response, from_sitemap=False) - else: print("\033[94m> Ignored: from start url\033[0m " + response.url)