From 869b76ce403939bc8e7e9faf1579cf0dbba127e4 Mon Sep 17 00:00:00 2001 From: zerty Date: Sat, 1 Jul 2023 20:17:59 +0200 Subject: [PATCH 1/8] Rewrite for ranobes.top using browser With search Issues: #1590 #1965 --- sources/en/r/ranobes.py | 177 +++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 91 deletions(-) diff --git a/sources/en/r/ranobes.py b/sources/en/r/ranobes.py index 7ea2aab0d..7258e2582 100644 --- a/sources/en/r/ranobes.py +++ b/sources/en/r/ranobes.py @@ -1,109 +1,104 @@ # -*- coding: utf-8 -*- import logging +import re -import js2py -from bs4.element import Tag +from typing import Generator, Union -from lncrawl.core.crawler import Crawler +from bs4 import BeautifulSoup, Tag + +from lncrawl.models import Chapter, SearchResult, Volume +from lncrawl.templates.browser.searchable import SearchableBrowserTemplate +from lncrawl.core.exeptions import FallbackToBrowser + +from urllib.parse import urljoin, quote_plus logger = logging.getLogger(__name__) -class RanobeLibCrawler(Crawler): +digit_regex = re.compile(r"\/(\d+)-") + + +class RanobeLibCrawler(SearchableBrowserTemplate): base_url = [ - "http://ranobes.net/", - "https://ranobes.net/", + "https://ranobes.top/", ] + has_manga = False + has_mtl = False def initialize(self) -> None: - self.init_executor(1) self.cleaner.bad_css.update([".free-support", 'div[id^="adfox_"]']) - def read_novel_info(self): - soup = self.get_soup(self.novel_url) - - main_page_link = soup.select_one("#mainside, .breadcrumbs-panel") - if isinstance(main_page_link, Tag): - main_page_link = main_page_link.select_one('a[href*="/novels/"]') - if isinstance(main_page_link, Tag): - self.novel_url = self.absolute_url(main_page_link["href"]) - logger.info("Visiting %s", self.novel_url) - soup = self.get_soup(self.novel_url) - - possible_title = soup.select_one('meta[property="og:title"]') - assert isinstance(possible_title, Tag) - self.novel_title = possible_title["content"] - logger.info("Novel title: %s", 
self.novel_title) - - possible_image = soup.select_one('meta[property="og:image"]') - if isinstance(possible_image, Tag): - self.novel_cover = self.absolute_url(possible_image["content"]) - logger.info("Novel cover: %s", self.novel_cover) - - author_link = soup.select_one('.tag_list a[href*="/authors/"]') - if isinstance(author_link, Tag): - self.novel_author = author_link.text.strip().title() - logger.info("Novel author: %s", self.novel_author) - - chapter_list_link = soup.select_one( - '#fs-chapters a[title="Go to table of contents"]' - ) - assert isinstance(chapter_list_link, Tag) - chapter_list_link = self.absolute_url(chapter_list_link["href"]) + def select_search_items_in_browser(self, query: str) -> Generator[Tag, None, None]: + self.visit(urljoin(self.home_url, "/search/{}/".format(quote_plus(query)))) + self.browser.wait(".breadcrumbs-panel") + for elem in self.browser.find_all(".short-cont .title a"): + yield elem.as_tag() + + def select_search_items(self, query: str) -> Generator[Tag, None, None]: + raise FallbackToBrowser() - logger.info("Visiting %s", chapter_list_link) - soup = self.get_soup(chapter_list_link) + def parse_search_item(self, tag: Tag) -> SearchResult: + return SearchResult( + title=tag.text.strip(), + url=self.absolute_url(tag["href"]), + ) - script = soup.find( - lambda tag: isinstance(tag, Tag) - and tag.name == "script" - and tag.text.startswith("window.__DATA__") + def visit_novel_page_in_browser(self) -> BeautifulSoup: + self.visit(self.novel_url) + self.browser.wait(".body_left_in") + self.novel_id = digit_regex.search(self.novel_url).group(1) + + def parse_title(self, soup: BeautifulSoup) -> str: + tag = soup.select_one("h1.title") + assert tag + return tag.text.strip() + + def parse_cover(self, soup: BeautifulSoup) -> str: + tag = soup.select_one(".r-fullstory-poster .poster a img") + assert tag + if tag.has_attr("data-src"): + return self.absolute_url(tag["data-src"]) + if tag.has_attr("src"): + return 
self.absolute_url(tag["src"]) + + def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]: + for a in soup.select('.tag_list a[href*="/authors/"]'): + yield a.text.strip() + + def parse_chapter_list_in_browser( + self, + ) -> Generator[Union[Chapter, Volume], None, None]: + self.browser.visit(urljoin(self.home_url, f"/chapters/{self.novel_id}/")) + self.browser.wait(".chapters__container") + _pages = max( + int(a["value"]) for a in self.browser.soup.select(".form_submit option") ) - assert isinstance(script, Tag) - - data = js2py.eval_js(script.text).to_dict() - assert isinstance(data, dict) - - pages_count = data["pages_count"] - logger.info("Total pages: %d", pages_count) - - futures = [] - page_soups = [soup] - for i in range(2, pages_count + 1): - chapter_page_url = chapter_list_link.strip("/") + ("/page/%d" % i) - f = self.executor.submit(self.get_soup, chapter_page_url) - futures.append(f) - page_soups += [f.result() for f in futures] - - volumes = set([]) - for soup in reversed(page_soups): - script = soup.find( - lambda tag: isinstance(tag, Tag) - and tag.name == "script" - and tag.text.startswith("window.__DATA__") + if not _pages: + _page = 1 + tags = self.browser.soup.select(".chapters__container .cat_line a") + for i in range(2, _pages + 1): + self.browser.visit( + urljoin(self.home_url, f"/chapters/{self.novel_id}/page/{i}/") ) - assert isinstance(script, Tag) - - data = js2py.eval_js(script.text).to_dict() - assert isinstance(data, dict) - - for chapter in reversed(data["chapters"]): - chap_id = len(self.chapters) + 1 - vol_id = len(self.chapters) // 100 + 1 - volumes.add(vol_id) - self.chapters.append( - { - "id": chap_id, - "volume": vol_id, - "title": chapter["title"], - "url": "https://ranobes.net/read-%s.html" % chapter["id"], - } - ) - - self.volumes = [{"id": x} for x in volumes] - - def download_chapter_body(self, chapter): - soup = self.get_soup(chapter["url"]) - article = soup.select_one('.text[itemprop="description"]') - 
self.cleaner.clean_contents(article) - return str(article) + self.browser.wait(".chapters__container") + tags += self.browser.soup.select(".chapters__container .cat_line a") + + for _id, _t in enumerate(reversed(tags)): + yield Chapter( + id=_id, url=self.absolute_url(_t.get("href")), title=_t.get("title") + ) + + def parse_chapter_list( + self, soup: BeautifulSoup + ) -> Generator[Union[Chapter, Volume], None, None]: + pass + + def visit_chapter_page_in_browser(self, chapter: Chapter) -> None: + self.visit(chapter.url) + self.browser.wait(".structure") + + def select_chapter_body(self, soup: BeautifulSoup) -> Tag: + return soup.select_one("div#arrticle") + + def download_chapter_body_in_scraper(self, chapter: Chapter) -> str: + raise FallbackToBrowser() From f55e0913154fd61313065b2d32a7afd53ab02639 Mon Sep 17 00:00:00 2001 From: zerty Date: Sat, 1 Jul 2023 20:20:32 +0200 Subject: [PATCH 2/8] Small error fix --- sources/en/r/ranobes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/en/r/ranobes.py b/sources/en/r/ranobes.py index 7258e2582..19c7d618f 100644 --- a/sources/en/r/ranobes.py +++ b/sources/en/r/ranobes.py @@ -74,7 +74,7 @@ def parse_chapter_list_in_browser( int(a["value"]) for a in self.browser.soup.select(".form_submit option") ) if not _pages: - _page = 1 + _pages = 1 tags = self.browser.soup.select(".chapters__container .cat_line a") for i in range(2, _pages + 1): self.browser.visit( From 228bd72e44b0ff103933a856412b1628b8372b74 Mon Sep 17 00:00:00 2001 From: zerty Date: Sat, 1 Jul 2023 20:58:28 +0200 Subject: [PATCH 3/8] Prevent search hanging for ranobes --- sources/en/r/ranobes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/en/r/ranobes.py b/sources/en/r/ranobes.py index 19c7d618f..eddfd87de 100644 --- a/sources/en/r/ranobes.py +++ b/sources/en/r/ranobes.py @@ -31,7 +31,7 @@ def initialize(self) -> None: def select_search_items_in_browser(self, query: str) -> Generator[Tag, None, None]: 
self.visit(urljoin(self.home_url, "/search/{}/".format(quote_plus(query)))) self.browser.wait(".breadcrumbs-panel") - for elem in self.browser.find_all(".short-cont .title a"): + for elem in self.browser.select(".short-cont .title a"): yield elem.as_tag() def select_search_items(self, query: str) -> Generator[Tag, None, None]: From 68f7c130111092eadd3ac6745a9f1ce8a8ad4685 Mon Sep 17 00:00:00 2001 From: zerty Date: Wed, 5 Jul 2023 19:50:44 +0200 Subject: [PATCH 4/8] Update Ranobes --- sources/en/r/ranobes.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/sources/en/r/ranobes.py b/sources/en/r/ranobes.py index eddfd87de..cac011f7c 100644 --- a/sources/en/r/ranobes.py +++ b/sources/en/r/ranobes.py @@ -21,6 +21,9 @@ class RanobeLibCrawler(SearchableBrowserTemplate): base_url = [ "https://ranobes.top/", + "http://ranobes.top/", + "https://ranobes.net/", + "http://ranobes.net/", ] has_manga = False has_mtl = False @@ -32,10 +35,15 @@ def select_search_items_in_browser(self, query: str) -> Generator[Tag, None, Non self.visit(urljoin(self.home_url, "/search/{}/".format(quote_plus(query)))) self.browser.wait(".breadcrumbs-panel") for elem in self.browser.select(".short-cont .title a"): - yield elem.as_tag() + yield elem def select_search_items(self, query: str) -> Generator[Tag, None, None]: - raise FallbackToBrowser() + soup = self.get_soup( + urljoin(self.home_url, "/search/{}/".format(quote_plus(query))) + ) + + for elem in soup.select(".short-cont .title a"): + yield elem def parse_search_item(self, tag: Tag) -> SearchResult: return SearchResult( @@ -91,7 +99,22 @@ def parse_chapter_list_in_browser( def parse_chapter_list( self, soup: BeautifulSoup ) -> Generator[Union[Chapter, Volume], None, None]: - pass + _pages = max( + int(a["value"]) for a in soup.select(".form_submit option") + ) + if not _pages: + _pages = 1 + tags = soup.select(".chapters__container .cat_line a") + for i in range(2, _pages + 1): + 
soup=self.get_soup( + urljoin(self.home_url, f"/chapters/{self.novel_id}/page/{i}/") + ) + tags += soup.select(".chapters__container .cat_line a") + + for _id, _t in enumerate(reversed(tags)): + yield Chapter( + id=_id, url=self.absolute_url(_t.get("href")), title=_t.get("title") + ) def visit_chapter_page_in_browser(self, chapter: Chapter) -> None: self.visit(chapter.url) From ecd5507863c3e86102948fecf42d12746606f078 Mon Sep 17 00:00:00 2001 From: zerty Date: Wed, 5 Jul 2023 19:52:36 +0200 Subject: [PATCH 5/8] Formatting File for ranobes --- sources/en/r/ranobes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sources/en/r/ranobes.py b/sources/en/r/ranobes.py index cac011f7c..e5c06a416 100644 --- a/sources/en/r/ranobes.py +++ b/sources/en/r/ranobes.py @@ -99,14 +99,12 @@ def parse_chapter_list_in_browser( def parse_chapter_list( self, soup: BeautifulSoup ) -> Generator[Union[Chapter, Volume], None, None]: - _pages = max( - int(a["value"]) for a in soup.select(".form_submit option") - ) + _pages = max(int(a["value"]) for a in soup.select(".form_submit option")) if not _pages: _pages = 1 tags = soup.select(".chapters__container .cat_line a") for i in range(2, _pages + 1): - soup=self.get_soup( + soup = self.get_soup( urljoin(self.home_url, f"/chapters/{self.novel_id}/page/{i}/") ) tags += soup.select(".chapters__container .cat_line a") From 51ecaf1639066e42802c2bd8c28b392d8db60054 Mon Sep 17 00:00:00 2001 From: Zerty <4232921+zerty@users.noreply.github.com> Date: Thu, 6 Jul 2023 14:58:15 +0200 Subject: [PATCH 6/8] Revert to old method in ranobes for colab --- sources/en/r/ranobes.py | 60 ++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/sources/en/r/ranobes.py b/sources/en/r/ranobes.py index e5c06a416..42f879b65 100644 --- a/sources/en/r/ranobes.py +++ b/sources/en/r/ranobes.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import logging import re - +import js2py from typing import Generator, 
Union from bs4 import BeautifulSoup, Tag @@ -99,20 +99,50 @@ def parse_chapter_list_in_browser( def parse_chapter_list( self, soup: BeautifulSoup ) -> Generator[Union[Chapter, Volume], None, None]: - _pages = max(int(a["value"]) for a in soup.select(".form_submit option")) - if not _pages: - _pages = 1 - tags = soup.select(".chapters__container .cat_line a") - for i in range(2, _pages + 1): - soup = self.get_soup( - urljoin(self.home_url, f"/chapters/{self.novel_id}/page/{i}/") - ) - tags += soup.select(".chapters__container .cat_line a") - - for _id, _t in enumerate(reversed(tags)): - yield Chapter( - id=_id, url=self.absolute_url(_t.get("href")), title=_t.get("title") + self.novel_id = digit_regex.search(self.novel_url).group(1) + chapter_list_link=urljoin(self.home_url, f"/chapters/{self.novel_id}/") + soup=self.get_soup(chapter_list_link) + script = soup.find( + + lambda tag: isinstance(tag, Tag) + and tag.name == "script" + and tag.text.startswith("window.__DATA__") + ) + assert isinstance(script, Tag) + data = js2py.eval_js(script.text).to_dict() + assert isinstance(data, dict) + + pages_count = data["pages_count"] + logger.info("Total pages: %d", pages_count) + + futures = [] + page_soups = [soup] + for i in range(2, pages_count + 1): + chapter_page_url = chapter_list_link.strip("/") + ("/page/%d" % i) + f = self.executor.submit(self.get_soup, chapter_page_url) + futures.append(f) + page_soups += [f.result() for f in futures] + + + _i=0 + for soup in reversed(page_soups): + script = soup.find( + lambda tag: isinstance(tag, Tag) + and tag.name == "script" + and tag.text.startswith("window.__DATA__") ) + assert isinstance(script, Tag) + + data = js2py.eval_js(script.text).to_dict() + assert isinstance(data, dict) + + for chapter in reversed(data["chapters"]): + _i+=1 + yield Chapter( + id = _i, + title = chapter["title"], + url = self.absolute_url(chapter["link"]), + ) def visit_chapter_page_in_browser(self, chapter: Chapter) -> None: self.visit(chapter.url) 
@@ -121,5 +151,3 @@ def visit_chapter_page_in_browser(self, chapter: Chapter) -> None: def select_chapter_body(self, soup: BeautifulSoup) -> Tag: return soup.select_one("div#arrticle") - def download_chapter_body_in_scraper(self, chapter: Chapter) -> str: - raise FallbackToBrowser() From 36d3a09086d6411a5b1c9b5eddcf849e93dee0db Mon Sep 17 00:00:00 2001 From: Zerty <4232921+zerty@users.noreply.github.com> Date: Thu, 6 Jul 2023 14:59:44 +0200 Subject: [PATCH 7/8] Small fix Ranobes --- sources/en/r/ranobes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sources/en/r/ranobes.py b/sources/en/r/ranobes.py index 42f879b65..bff923927 100644 --- a/sources/en/r/ranobes.py +++ b/sources/en/r/ranobes.py @@ -8,7 +8,6 @@ from lncrawl.models import Chapter, SearchResult, Volume from lncrawl.templates.browser.searchable import SearchableBrowserTemplate -from lncrawl.core.exeptions import FallbackToBrowser from urllib.parse import urljoin, quote_plus From b93ea782d860bfb3fe53f3759df960ff9ca200cc Mon Sep 17 00:00:00 2001 From: Zerty <4232921+zerty@users.noreply.github.com> Date: Thu, 6 Jul 2023 15:09:13 +0200 Subject: [PATCH 8/8] Fixes for Flake8 checks --- sources/en/r/ranobes.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/sources/en/r/ranobes.py b/sources/en/r/ranobes.py index bff923927..8467cb694 100644 --- a/sources/en/r/ranobes.py +++ b/sources/en/r/ranobes.py @@ -99,10 +99,9 @@ def parse_chapter_list( self, soup: BeautifulSoup ) -> Generator[Union[Chapter, Volume], None, None]: self.novel_id = digit_regex.search(self.novel_url).group(1) - chapter_list_link=urljoin(self.home_url, f"/chapters/{self.novel_id}/") - soup=self.get_soup(chapter_list_link) + chapter_list_link = urljoin(self.home_url, f"/chapters/{self.novel_id}/") + soup = self.get_soup(chapter_list_link) script = soup.find( lambda tag: isinstance(tag, Tag) and tag.name == "script" and tag.text.startswith("window.__DATA__") @@ -122,8 +121,7 @@ def parse_chapter_list( 
futures.append(f) page_soups += [f.result() for f in futures] - - _i=0 + _i = 0 for soup in reversed(page_soups): script = soup.find( lambda tag: isinstance(tag, Tag) @@ -134,13 +132,13 @@ def parse_chapter_list( data = js2py.eval_js(script.text).to_dict() assert isinstance(data, dict) - + for chapter in reversed(data["chapters"]): - _i+=1 + _i += 1 yield Chapter( - id = _i, - title = chapter["title"], - url = self.absolute_url(chapter["link"]), + id=_i, + title=chapter["title"], + url=self.absolute_url(chapter["link"]), ) def visit_chapter_page_in_browser(self, chapter: Chapter) -> None: @@ -149,4 +147,3 @@ def visit_chapter_page_in_browser(self, chapter: Chapter) -> None: def select_chapter_body(self, soup: BeautifulSoup) -> Tag: return soup.select_one("div#arrticle") -