Merge pull request #2397 from CxRxExO/master

Add source
dipu-bd · Jun 14, 2024 · 3011fa2
2 parents 774a024 + e2c27be
commit 3011fa2
Showing 1 changed file with 152 additions and 0 deletions.
diff --git a/sources/es/novelasligeras.py b/sources/es/novelasligeras.py
@@ -0,0 +1,152 @@
+# -*- coding: utf-8 -*-
+import logging
+import re
+from typing import List
+from urllib.parse import quote_plus, urlparse
+
+from lncrawl.core.crawler import Crawler
+from lncrawl.models import Chapter, SearchResult, Volume
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+search_url = (  # search URL template; the trailing %s is filled with the query in search_novel
+    "https://novelasligeras.net/?post_type=product&title=1&excerpt=1&content=0&categories=1&attributes=1"
+    "&tags=1&sku=0&orderby=title-DESC&ixwps=1&s=%s"
+)
+
+
+class NovelasLigerasCrawler(Crawler):
+    """Crawler for https://novelasligeras.net/ (Spanish-language source).
+
+    Implements login, search, novel-info parsing and chapter download using
+    the helpers this class calls on itself (``get_soup``, ``post_response``,
+    ``absolute_url``, ``cleaner``), which are expected to come from
+    ``Crawler``.
+    """
+
+    base_url = ["https://novelasligeras.net/"]
+    has_manga = False
+    has_mtl = False
+
+    def initialize(self) -> None:
+        """Register site-specific patterns with the content cleaner.
+
+        Adds the text pattern ``Publicidad`` to ``cleaner.bad_text_regex`` and
+        the CSS selector ``div[style]`` to ``cleaner.bad_css`` (presumably so
+        ad labels and inline-styled blocks are removed from chapter bodies --
+        confirm against the cleaner implementation).
+        """
+        self.cleaner.bad_text_regex.update(["Publicidad"])
+        self.cleaner.bad_css.update(["div[style]"])
+
+    def login(self, email: str, password: str) -> None:
+        """Submit the site's login form fields.
+
+        The form data is POSTed to ``base_url[0]`` with browser-like headers.
+        The response is not inspected here.
+
+        Args:
+            email: Value sent as the ``log`` form field.
+            password: Value sent as the ``pwd`` form field.
+        """
+        # TODO optimize login headers
+        header = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language": "es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "same-origin",
+            "Sec-Fetch-User": "?1",
+            "TE": "Trailers",
+            "Referer": "https://novelasligeras.net/index.php/suscripcion-ingresar/",
+        }
+        data = {
+            "log": email,
+            "pwd": password,
+            "wp-submit": "Acceder",
+            "redirect_to": "https://novelasligeras.net/index.php/suscripcion-cuenta-v2/",
+            "mepr_process_login_form": "true",
+            "mepr_is_login_page": "true",
+            "testcookie": "1",
+        }
+        self.post_response(self.base_url[0], data=data, headers=header)
+
+    def search_novel(self, query) -> List[SearchResult]:
+        """Search the site for novels matching *query*.
+
+        Args:
+            query: Free-text search terms.
+
+        Returns:
+            One ``SearchResult`` per result cell that carries both a
+            ``data-name`` title and a ``.alignnone`` link with an ``href``.
+            Cells missing either are skipped. ``info`` holds the rating
+            label, or ``N/A`` when the cell has none.
+        """
+        # URL-encode the query so characters such as "&" or "#" cannot break
+        # or truncate the query string; spaces still become "+".
+        encoded_query = quote_plus(query.strip().lower())
+        soup = self.get_soup(search_url % encoded_query)
+
+        results = []
+        for cell in soup.select(".wf-cell[data-post-id]"):
+            title = cell.get("data-name")
+            link = cell.select_one(".alignnone")
+            url = link.get("href") if link is not None else None
+            if not title or not url:
+                # Incomplete cell: skip it instead of failing the whole search.
+                continue
+
+            rating = "N/A"
+            rating_element = cell.select_one(".star-rating")
+            if rating_element is not None:
+                rating = rating_element.get("aria-label", rating)
+
+            results.append(
+                SearchResult(
+                    title=title.strip(),
+                    url=self.absolute_url(url),
+                    info="Clasificación: %s" % rating,
+                )
+            )
+
+        return results
+
+    @staticmethod
+    def _format_chapter_title(raw_title: str) -> str:
+        """Normalize the text of a chapter link into a chapter title.
+
+        Replaces the unaccented word ``Capitulo`` with ``Capítulo``. When the
+        result contains both ``Parte`` and ``Capítulo``, the segments
+        separated by `` – `` are put in reverse order (presumably so the
+        chapter segment precedes the part segment -- confirm on the site).
+        """
+        title = re.sub(r"\bCapitulo\b", "Capítulo", raw_title.strip())
+        if "Parte" in title and "Capítulo" in title:
+            return " – ".join(title.split(" – ")[::-1])
+        return title
+
+    def read_novel_info(self) -> None:
+        """Load title, author, cover, synopsis, volumes and chapters.
+
+        Fetches ``self.novel_url`` and fills ``novel_title``, ``novel_author``,
+        ``novel_cover`` and ``novel_synopsis`` from the page, then appends one
+        ``Chapter`` per dated post link and one ``Volume`` per distinct volume
+        number found in those links.
+
+        Raises:
+            AssertionError: If the page has no ``h1.product_title`` element.
+        """
+        logger.debug("Visiting %s", self.novel_url)
+        soup = self.get_soup(self.novel_url)
+
+        possible_title = soup.select_one("h1.product_title")
+        assert possible_title, "Sin título"
+        self.novel_title = possible_title.text.strip()
+
+        possible_author = soup.select_one(
+            'tr.woocommerce-product-attributes-item--attribute_pa_escritor a[rel="tag"]'
+        )
+        if possible_author:
+            self.novel_author = possible_author.text.strip()
+
+        possible_cover = soup.select_one('meta[property="og:image"]')
+        # A <meta> tag without a "content" attribute must not abort parsing.
+        cover_url = possible_cover.get("content") if possible_cover is not None else None
+        if cover_url:
+            self.novel_cover = self.absolute_url(cover_url)
+
+        synopsis = soup.select_one(".woocommerce-product-details__short-description")
+        if synopsis:
+            self.novel_synopsis = synopsis.text
+
+        # Chapter links look like <host>/index.php/YYYY/MM/DD/...; the dot in
+        # "index.php" is escaped so it only matches a literal dot.
+        hostname = urlparse(self.novel_url).hostname or ""
+        chapter_url_re = re.compile(
+            re.escape(hostname) + r"/index\.php/\d{4}/\d{2}/\d{2}/"
+        )
+        volume_re = re.compile(r"-volumen-(\d+)-")
+
+        logger.debug("pattern = %s", chapter_url_re.pattern)
+
+        # Set lookup replaces a rescan of self.volumes for every chapter.
+        known_volume_ids = {vol["id"] for vol in self.volumes}
+        last_vol_id = 0
+        chapters_count = 0
+
+        for a in soup.select(
+            ".wpb_wrapper a:not([id],[title],[href$='suscripciones/'],[href*='patreon'],[href*='paypal'])"
+        ):
+            # The selector can match anchors that have no href at all.
+            href = a.get("href")
+            if not href or not chapter_url_re.search(href):
+                continue
+            chapters_count += 1
+
+            # Links without a volume number stay in the last volume seen
+            # (volume 0 until the first numbered link appears).
+            match = volume_re.search(href)
+            if match:
+                last_vol_id = int(match.group(1))
+            vol_id = last_vol_id
+
+            vol_title = f"Volumen {vol_id}"
+            if vol_id not in known_volume_ids:
+                known_volume_ids.add(vol_id)
+                self.volumes.append(Volume(id=vol_id, title=vol_title))
+
+            self.chapters.append(
+                Chapter(
+                    id=chapters_count,
+                    title=self._format_chapter_title(a.text),
+                    url=self.absolute_url(href),
+                    volume=vol_id,
+                    volume_title=vol_title,
+                )
+            )
+
+    def download_chapter_body(self, chapter):
+        """Fetch a chapter page and return its cleaned content.
+
+        Args:
+            chapter: Chapter record; its ``"url"`` entry is fetched.
+
+        Returns:
+            The result of ``cleaner.extract_contents`` on the first child
+            ``div`` of ``.wpb_text_column``, or the placeholder string
+            ``--Error al cargar el capítulo--`` when that element is missing.
+        """
+        soup = self.get_soup(chapter["url"])
+        # Query the DOM once and reuse the element.
+        content = soup.select_one(".wpb_text_column > div:nth-child(1)")
+        if not content:
+            logger.warning("No chapter content found at %s", chapter["url"])
+            return "--Error al cargar el capítulo--"
+        return self.cleaner.extract_contents(content)