Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update sources and fix an issue in crawler template #1989

Merged
merged 26 commits into from
Jul 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b5d8c36
typo
jere344 Jun 11, 2023
d61e2ea
Merge pull request #1977 from jere344/patch-1
dipu-bd Jun 11, 2023
8e54e7b
Update 1stkissnovel.py domain has changed from .love to .org
Anuj2976 Jun 16, 2023
659cbed
Update 1stkissnovel.py
dipu-bd Jun 16, 2023
20a9048
Update 1stkissnovel.py
dipu-bd Jun 16, 2023
e215214
Merge pull request #1978 from Anuj2976/patch-1
dipu-bd Jun 16, 2023
5480cf7
Generate source index
dipu-bd Jun 16, 2023
de3fcd9
Fix exiledrebels.py
jere344 Jun 20, 2023
ce1bdcd
Merge pull request #1980 from jere344/patch-2
dipu-bd Jun 20, 2023
17ef3e0
Generate source index
dipu-bd Jun 20, 2023
5560b37
Browser for https://novelsonline.net
zerty Jun 29, 2023
8297150
Fix chireads
jere344 Jun 29, 2023
62f12ba
Fix for lightnovelpub.com
zerty Jun 30, 2023
b871b4a
Small errors on Browser volume templates
zerty Jun 30, 2023
b6e2377
fix for novelsemperor.com
zerty Jun 30, 2023
18af372
fix for lightnovelstranslations.com/
zerty Jun 30, 2023
83fc3b7
Merge pull request #1988 from zerty/lightnovetrans
dipu-bd Jul 1, 2023
3fe82c8
Generate source index
dipu-bd Jul 1, 2023
79ec3da
Merge pull request #1986 from zerty/novelpub
dipu-bd Jul 1, 2023
1a82489
Merge pull request #1987 from zerty/novelemperor
dipu-bd Jul 1, 2023
16aa871
Update chireads.py
dipu-bd Jul 1, 2023
1145b9e
Merge pull request #1985 from jere344/patch-3
dipu-bd Jul 1, 2023
ce517d7
Merge pull request #1984 from zerty/novelonlinenet
dipu-bd Jul 1, 2023
2ae9f87
Generate source index
dipu-bd Jul 1, 2023
eb1fa2d
Update VERSION
dipu-bd Jul 1, 2023
60b9750
Generate source index
dipu-bd Jul 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/contribs.json
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,7 @@
"Yoga Setiawan": null,
"yogainformatika@gmail.com": null,
"dev ops": null,
"ismaelcomsci@gmail.com": null
"ismaelcomsci@gmail.com": null,
"Anuj2976": null,
"akakanuj@gmail.com": null
}
683 changes: 344 additions & 339 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lncrawl/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.2.6
3.2.7
4 changes: 2 additions & 2 deletions lncrawl/templates/browser/optional_volume.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ def parse_volume_item_in_browser(self, tag: Tag, id: int) -> Volume:

def select_chapter_tags_in_browser(self, tag: Tag) -> Generator[Tag, None, None]:
"""Select chapter list item tags from volume tag from the browser"""
raise self.select_chapter_tags(tag)
return self.select_chapter_tags(tag)

def parse_chapter_item_in_browser(self, tag: Tag, id: int, vol: Volume) -> Chapter:
"""Parse a single chapter from chapter list item tag from the browser"""
raise self.parse_chapter_item(tag, id, vol)
return self.parse_chapter_item(tag, id, vol)
4 changes: 2 additions & 2 deletions lncrawl/templates/browser/with_volume.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def select_chapter_tags_in_browser(
self, tag: Tag, vol: Volume
) -> Generator[Tag, None, None]:
"""Select chapter list item tags from volume tag from the browser"""
raise self.select_chapter_tags(tag, vol)
return self.select_chapter_tags(tag, vol)

def parse_chapter_item_in_browser(self, tag: Tag, id: int, vol: Volume) -> Chapter:
"""Parse a single chapter from chapter list item tag from the browser"""
raise self.parse_chapter_item(tag, id, vol)
return self.parse_chapter_item(tag, id, vol)
8 changes: 7 additions & 1 deletion lncrawl/templates/mangastream.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def parse_title_in_browser(self) -> str:
return self.parse_title(self.browser.soup)

def parse_cover(self, soup: BeautifulSoup) -> str:
tag = soup.select_one(".thumbook img, meta[property='og:image']")
tag = soup.select_one(
".thumbook img, meta[property='og:image'],.sertothumb img"
)
if tag.has_attr("data-src"):
return self.absolute_url(tag["data-src"])

Expand Down Expand Up @@ -84,3 +86,7 @@ def parse_chapter_item(self, tag: Tag, id: int, vol: Volume) -> Chapter:

def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
return soup.select_one("#readernovel, #readerarea, .entry-content")

def visit_chapter_page_in_browser(self, chapter: Chapter) -> None:
self.visit(chapter.url)
self.browser.wait("#readernovel, #readerarea, .entry-content,.mainholder")
2 changes: 1 addition & 1 deletion lncrawl/templates/novelpub.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

logger = logging.getLogger(__name__)

digit_regex = re.compile(r"page-(\d+)$")
digit_regex = re.compile(r"page[-,=](\d+)")


class NovelPubTemplate(SearchableBrowserTemplate, ChapterOnlyBrowserTemplate):
Expand Down
2 changes: 1 addition & 1 deletion lncrawl/templates/novelupdates.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

automation_warning = """
<div style="opacity: 0.5; padding: 14px; text-align: center; border: 1px solid #000; font-style: italic; font-size: 0.825rem">
Parsed with an automated reader. The content accuracy is not guranteed.
Parsed with an automated reader. The content accuracy is not guaranteed.
</div>
""".strip()

Expand Down
2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.

26 changes: 6 additions & 20 deletions sources/en/1/1stkissnovel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,23 @@

logger = logging.getLogger(__name__)
search_url = (
"https://1stkissnovel.love/?s=%s&post_type=wp-manga&author=&artist=&release="
"%s?s=%s&post_type=wp-manga&author=&artist=&release="
)
wp_admin_ajax_url = "https://1stkissnovel.love/wp-admin/admin-ajax.php"


class OneKissNovelCrawler(Crawler):
has_mtl = True
base_url = "https://1stkissnovel.love/"
base_url = [
"https://1stkissnovel.org/",
"https://1stkissnovel.love/",
]

def initialize(self) -> None:
self.cleaner.bad_tags.update(["h3"])

def search_novel(self, query):
query = query.lower().replace(" ", "+")
soup = self.get_soup(search_url % query)
soup = self.get_soup(search_url % (self.home_url, query))

results = []
for tab in soup.select(".c-tabs-item__content"):
Expand All @@ -34,7 +36,6 @@ def search_novel(self, query):
"info": "%s | Rating: %s" % (latest, votes),
}
)

return results

def read_novel_info(self):
Expand All @@ -48,10 +49,8 @@ def read_novel_info(self):
logger.info("Novel title: %s", self.novel_title)

img_src = soup.select_one(".summary_image a img")

if img_src:
self.novel_cover = self.absolute_url(img_src["data-src"])

logger.info("Novel cover: %s", self.novel_cover)

self.novel_author = " ".join(
Expand All @@ -65,18 +64,6 @@ def read_novel_info(self):
self.novel_id = soup.select_one("#manga-chapters-holder")["data-id"]
logger.info("Novel id: %s", self.novel_id)

# For getting cookies
# self.submit_form(wp_admin_ajax_url, data={
# 'action': 'manga_views',
# 'manga': self.novel_id,
# })

# Deprecated way to fetch chapters
# response = self.submit_form(wp_admin_ajax_url, data={
# 'action': 'manga_get_chapters',
# 'manga': self.novel_id,
# })

clean_novel_url = self.novel_url.split("?")[0].strip("/")
response = self.submit_form(f"{clean_novel_url}/ajax/chapters/")

Expand All @@ -96,7 +83,6 @@ def read_novel_info(self):
)

def download_chapter_body(self, chapter):
logger.info("Visiting %s", chapter["url"])
soup = self.get_soup(chapter["url"])
contents = soup.select_one("div.text-left")
return self.cleaner.extract_contents(contents)
2 changes: 1 addition & 1 deletion sources/en/e/exiledrebels.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,5 @@ def read_novel_info(self):

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter["url"])
contents = soup.select("div#wtr-content")
contents = soup.select_one("div#wtr-content")
return self.cleaner.extract_contents(contents)
92 changes: 46 additions & 46 deletions sources/en/l/lightnovetrans.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,52 @@
# -*- coding: utf-8 -*-

import logging
from lncrawl.core.crawler import Crawler
from typing import Generator, Union

from bs4 import BeautifulSoup, Tag

from lncrawl.models import Chapter, Volume
from lncrawl.templates.soup.general import GeneralSoupTemplate

logger = logging.getLogger(__name__)


class LNTCrawler(Crawler):
base_url = 'https://lightnovelstranslations.com/'

def read_novel_info(self):
soup = self.get_soup(self.novel_url)

possible_title = soup.select_one('h1.entry-title')
assert possible_title, 'No novel title'
self.novel_title = possible_title.text

possible_cover = soup.select_one('meta[property="og:image"]')
if possible_cover:
self.novel_cover = self.absolute_url(possible_cover['content'])

for p in soup.select('.entry-content > p'):
if 'Author' in p.text:
self.novel_author = p.text.replace('Author:', '').strip()
break

for div in soup.select('.entry-content .su-spoiler'):
vol = div.select_one('.su-spoiler-title').text.strip()
vol_id = int(vol) if vol.isdigit() else len(self.volumes) + 1
self.volumes.append({
'id': vol_id,
'title': vol,
})
for a in div.select('.su-spoiler-content p a'):
if not a.has_attr('href'):
continue
self.chapters.append({
'id': len(self.chapters) + 1,
'volume': vol_id,
'title': a.text.strip(),
'url': self.absolute_url(a['href']),
})

def download_chapter_body(self, chapter):
logger.info('Visiting: %s', chapter['url'])
soup = self.get_soup(chapter['url'])

content = soup.select_one('.entry-content')
for bad in content.select('.alignleft, .alignright, hr, p[style*="text-align: center"]'):
bad.extract()

return '\n'.join([str(p) for p in content.find_all('p')])
class LNTCrawler(GeneralSoupTemplate):
base_url = ["https://lightnovelstranslations.com/"]

has_manga = False
has_mtl = False

def get_novel_soup(self) -> BeautifulSoup:
return self.get_soup(f"{self.novel_url}/?tab=table_contents")

def parse_title(self, soup: BeautifulSoup) -> str:
tag = soup.select_one(".novel_title")
assert tag
return tag.text.strip()

def parse_cover(self, soup: BeautifulSoup) -> str:
tag = soup.select_one(".novel-image img")
assert tag
if tag.has_attr("data-src"):
return self.absolute_url(tag["data-src"])
if tag.has_attr("src"):
return self.absolute_url(tag["src"])

def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
for p in soup.select(".entry-content > p"):
if "Author" in p.text:
yield p.text.replace("Author:", "").strip()

def parse_chapter_list(
self, soup: BeautifulSoup
) -> Generator[Union[Chapter, Volume], None, None]:
_id = 0
for a in soup.select(".novel_list_chapter_content li.unlock a"):
_id += 1
yield Chapter(
id=_id, url=self.absolute_url(a["href"]), title=a.text.strip()
)

def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
return soup.select_one(".text_story")
122 changes: 62 additions & 60 deletions sources/en/n/novelsonline.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,78 @@
# -*- coding: utf-8 -*-

import logging
import re
from lncrawl.core.crawler import Crawler
from typing import Generator, Union

logger = logging.getLogger(__name__)
search_url = "https://novelsonline.net/search/autocomplete"
from bs4 import BeautifulSoup, Tag

from lncrawl.models import Chapter, Volume
from lncrawl.templates.browser.general import GeneralBrowserTemplate

class NovelsOnline(Crawler):
base_url = "https://novelsonline.net/"
logger = logging.getLogger(__name__)

def read_novel_info(self):
logger.debug("Visiting %s", self.novel_url)
soup = self.get_soup(self.novel_url)

possible_title = soup.select_one(".block-title h1")
assert possible_title, "No novel title"
self.novel_title = possible_title.text
logger.info("Novel title: %s", self.novel_title)
class NovelsOnline(GeneralBrowserTemplate):
base_url = ["https://novelsonline.net/"]
has_manga = False
has_mtl = False

self.novel_cover = self.absolute_url(
soup.find("img", {"alt": self.novel_title})["src"]
# TODO: [OPTIONAL] This is called before all other methods.
def initialize(self) -> None:
self.cleaner.bad_tags.update(["div"])
self.cleaner.bad_css.update(
[
".trinity-player-iframe-wrapper",
".hidden",
".ads-title",
"script",
"center",
"interaction",
"a[href*=remove-ads]",
"a[target=_blank]",
"hr",
"br",
"#growfoodsmart",
".col-md-6",
".trv_player_container",
".ad1",
]
)
logger.info("Novel cover: %s", self.novel_cover)

author_link = soup.select_one("a[href*=author]")
if author_link:
self.novel_author = author_link.text.strip().title()
logger.info("Novel author: %s", self.novel_author)

volume_ids = set()
for a in soup.select(".chapters .chapter-chs li a"):
chap_id = len(self.chapters) + 1
vol_id = (chap_id - 1) // 100 + 1
volume_ids.add(vol_id)
self.chapters.append(
{
"id": chap_id,
"volume": vol_id,
"url": self.absolute_url(a["href"]),
"title": a.text.strip() or ("Chapter %d" % chap_id),
}
)
# TODO: [OPTIONAL] Open the Novel URL in the browser
def visit_novel_page_in_browser(self) -> BeautifulSoup:
self.visit(self.novel_url)
self.browser.wait(".container--content")

self.volumes = [{"id": i} for i in volume_ids]
def parse_title(self, soup: BeautifulSoup) -> str:
tag = soup.select_one(".block-title h1")
assert tag
return tag.text.strip()

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter["url"])
def parse_cover(self, soup: BeautifulSoup) -> str:
tag = soup.find("img", {"alt": self.novel_title})
assert tag
if tag.has_attr("data-src"):
return self.absolute_url(tag["data-src"])
elif tag.has_attr("src"):
return self.absolute_url(tag["src"])

div = soup.select_one(".chapter-content3")
def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
for a in soup.select("a[href*=author]"):
yield a.text.strip()

bad_selectors = [
".trinity-player-iframe-wrapper" ".hidden",
".ads-title",
"script",
"center",
"interaction",
"a[href*=remove-ads]",
"a[target=_blank]",
"hr",
"br",
"#growfoodsmart",
".col-md-6",
]
for hidden in div.select(", ".join(bad_selectors)):
hidden.extract()
def parse_chapter_list(
self, soup: BeautifulSoup
) -> Generator[Union[Chapter, Volume], None, None]:
_id = 0
for a in soup.select(".chapters .chapter-chs li a"):
_id += 1
yield Chapter(
id=_id, url=self.absolute_url(a["href"]), title=a.text.strip()
)

body = self.cleaner.extract_contents(div)
if re.search(r"c?hapter .?\d+", body[0], re.IGNORECASE):
title = body[0].replace("<strong>", "").replace("</strong>", "").strip()
title = ("C" if title.startswith("hapter") else "") + title
chapter["title"] = title.strip()
body = body[1:]
def visit_chapter_page_in_browser(self, chapter: Chapter) -> None:
self.visit(chapter.url)
self.browser.wait(".container--content")

return "<p>" + "</p><p>".join(body) + "</p>"
def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
return soup.select_one("#contentall")
Loading