From 1faec285d1974210e7d63baee1bf5af9f84f3e7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 27 Sep 2019 23:14:32 +0200 Subject: [PATCH] [nijie] further improvements (closes #423) - provide a 'user_name' metadata field - usually the same as 'artist_id', except for favorite downloads - extract the whole description text and properly escape HTML entities - fixed an issue with titles or tags containing double quotes --- gallery_dl/extractor/nijie.py | 49 ++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 6963ce2a98..fdfad87af4 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -30,12 +30,12 @@ class NijieExtractor(AsynchronousMixin, Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user_id = match.group(1) + self.user_id = text.parse_int(match.group(1)) + self.user_name = None self.session.headers["Referer"] = self.root + "/" def items(self): self.login() - metadata = self.metadata() yield Message.Version, 1 for image_id in self.image_ids(): @@ -46,7 +46,6 @@ def items(self): page = response.text data = self._extract_data(page) - data.update(metadata) data["image_id"] = text.parse_int(image_id) yield Message.Directory, data @@ -56,24 +55,19 @@ def items(self): image["extension"] = "jpg" yield Message.Url, image["url"], image - def metadata(self): - """Collect metadata for extractor-job""" - return {"user_id": text.parse_int(self.user_id)} - def image_ids(self): """Collect all relevant image-ids""" - return () @staticmethod def _extract_data(page): """Extract image metadata from 'page'""" extr = text.extract_from(page) keywords = text.unescape(extr( - 'name="keywords" content="', '"')).split(",") - return { + 'name="keywords" content="', '" />')).split(",") + data = { "title" : keywords[0].strip(), "description": text.unescape(extr( - '"og:description" content="', 
'"')), + '"description": "', '"').replace("&amp;", "&")), "date" : text.parse_datetime(extr( '"datePublished": "', '"')[:-4] + "+0900", "%a %d %b %Y %I:%M:%S %p%z"), @@ -82,6 +76,9 @@ def _extract_data(page): "artist_name": keywords[1], "tags" : keywords[2:-1], } + data["user_id"] = data["artist_id"] + data["user_name"] = data["artist_name"] + return data @staticmethod def _extract_images(page): @@ -118,6 +115,10 @@ def _pagination(self, path): while True: page = self.request(url, params=params, notfound="artist").text + + if not self.user_name: + self.user_name = text.unescape(text.extract( + page, '<br />', '<')[0] or "") yield from text.extract_iter(page, 'illust_id="', '"') if '