From 790b1336a621ed5ef6f60fe08bdd149cca3e1b69 Mon Sep 17 00:00:00 2001 From: Leonardo Taccari Date: Sun, 24 Mar 2019 12:15:00 +0100 Subject: [PATCH] [instagram] Add support for hashtags Add support for hashtags (TagPage-s), i.e. explore/tags/ URLs. This also introduces a get_metadata() method so that each (sub)extractor can append its own additional metadata. Refactor and generalize _extract_profilepage() into _extract_page() so that it can be reused by both _extract_profilepage() and _extract_tagpage() simply by passing the type of page (`ProfilePage' or `TagPage'), which selects the respective fields in the shared data. --- gallery_dl/extractor/instagram.py | 77 ++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index e7102473e6..4b8ecfcd5b 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -22,10 +22,15 @@ class InstagramExtractor(Extractor): archive_fmt = "{media_id}" root = "https://www.instagram.com" + def get_metadata(self): + return {} + def items(self): yield Message.Version, 1 + metadata = self.get_metadata() for data in self.instagrams(): + data.update(metadata) yield Message.Directory, data if data['typename'] == 'GraphImage': @@ -87,25 +92,43 @@ def _extract_postpage(self, url): return medias - def _extract_profilepage(self, url): + def _extract_page(self, url, page_type): + shared_data_fields = { + 'ProfilePage': { + 'node': 'user', + 'node_id': 'id', + 'edge_to_medias': 'edge_owner_to_timeline_media', + 'variables_id': 'id', + 'query_hash': '66eb9403e44cc12e5b5ecda48b667d41', + }, + 'TagPage': { + 'node': 'hashtag', + 'node_id': 'name', + 'edge_to_medias': 'edge_hashtag_to_media', + 'variables_id': 'tag_name', + 'query_hash': 'f92f56d47dc7a55b606908374b43a314', + }, + } + page = self.request(url).text shared_data = self._extract_shared_data(page) + psdf = shared_data_fields[page_type] while True: - # Deal with 
different structure of profile pages: the first page + # Deal with different structure of pages: the first page # has interesting data in `entry_data', next pages in `data'. if 'entry_data' in shared_data: - base_shared_data = shared_data['entry_data']['ProfilePage'][0]['graphql'] + base_shared_data = shared_data['entry_data'][page_type][0]['graphql'] - # `rhx_gis' and `user_id' are available only in the first page + # `rhx_gis' and variables_id are available only in the first page rhx_gis = shared_data['rhx_gis'] - user_id = base_shared_data['user']['id'] + variables_id = base_shared_data[psdf['node']][psdf['node_id']] else: base_shared_data = shared_data['data'] - timeline = base_shared_data['user']['edge_owner_to_timeline_media'] - has_next_page = timeline['page_info']['has_next_page'] - shortcodes = [n['node']['shortcode'] for n in timeline['edges']] + medias = base_shared_data[psdf['node']][psdf['edge_to_medias']] + has_next_page = medias['page_info']['has_next_page'] + shortcodes = [n['node']['shortcode'] for n in medias['edges']] for s in shortcodes: url = '{}/p/{}/'.format(self.root, s) @@ -114,9 +137,10 @@ def _extract_profilepage(self, url): if not has_next_page: break - end_cursor = timeline['page_info']['end_cursor'] - variables = '{{"id":"{}","first":12,"after":"{}"}}'.format( - user_id, + end_cursor = medias['page_info']['end_cursor'] + variables = '{{"{}":"{}","first":12,"after":"{}"}}'.format( + psdf['variables_id'], + variables_id, end_cursor, ) xigis = '{}:{}'.format(rhx_gis, variables) @@ -126,11 +150,17 @@ def _extract_profilepage(self, url): } url = '{}/graphql/query/?query_hash={}&variables={}'.format( self.root, - '66eb9403e44cc12e5b5ecda48b667d41', + psdf['query_hash'], variables, ) shared_data = self.request(url, headers=headers).json() + def _extract_profilepage(self, url): + yield from self._extract_page(url, 'ProfilePage') + + def _extract_tagpage(self, url): + yield from self._extract_page(url, 'TagPage') + class 
InstagramImageExtractor(InstagramExtractor): """Extractor for PostPage""" @@ -219,3 +249,26 @@ def __init__(self, match): def instagrams(self): url = '{}/{}/'.format(self.root, self.username) return self._extract_profilepage(url) + + +class InstagramTagExtractor(InstagramExtractor): + """Extractor for TagPage""" + subcategory = "tag" + directory_fmt = ("{category}", "{subcategory}", "{tag}") + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/explore/tags/([^/?&#]+)") + test = ("https://www.instagram.com/explore/tags/instagram/", { + "range": "1-12", + "count": ">= 12", + }) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.tag = match.group(1) + + def get_metadata(self): + return {"tag": self.tag} + + def instagrams(self): + url = '{}/explore/tags/{}/'.format(self.root, self.tag) + return self._extract_tagpage(url)