From f6ac8cd49526c38616ef7d979050635cf31015f3 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Tue, 7 Apr 2020 22:05:09 +0200 Subject: [PATCH 01/19] [nebula] Add basic support for Nebula (refs #21258) --- AUTHORS | 1 + docs/supportedsites.md | 1 + youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/nebula.py | 132 +++++++++++++++++++++++++++++ 4 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/nebula.py diff --git a/AUTHORS b/AUTHORS index b507cb8dfab..64ac712494e 100644 --- a/AUTHORS +++ b/AUTHORS @@ -246,3 +246,4 @@ Enes Solak Nathan Rossi Thomas van der Berg Luca Cherubin +Henrik Heimbuerger diff --git a/docs/supportedsites.md b/docs/supportedsites.md index aa8026a322e..a4372912e36 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -569,6 +569,7 @@ - **ndr:embed** - **ndr:embed:base** - **NDTV** + - **Nebula** - **NerdCubedFeed** - **netease:album**: 网易云音乐 - 专辑 - **netease:djradio**: 网易云音乐 - 电台 diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 29b0e615ed1..9874441d5a6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -731,8 +731,9 @@ NJoyEmbedIE, ) from .ndtv import NDTVIE -from .netzkino import NetzkinoIE +from .nebula import NebulaIE from .nerdcubed import NerdCubedFeedIE +from .netzkino import NetzkinoIE from .neteasemusic import ( NetEaseMusicIE, NetEaseMusicAlbumIE, diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py new file mode 100644 index 00000000000..e22a4b08886 --- /dev/null +++ b/youtube_dl/extractor/nebula.py @@ -0,0 +1,132 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os + +from .common import InfoExtractor +from ..utils import parse_iso8601 + +COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH') # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests + + +class NebulaIE(InfoExtractor): + """ + Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos + off-YouTube from a small hand-picked group of creators. + + All videos require a subscription to watch. There are no known freely available videos. So the test case is + disabled (but should pass when supplying a 'nebula-auth' cookie for an account with a valid subscription). + + Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off + video extraction to the Zype extractor. + + This description has been last updated on 2020-04-07. + """ + + _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the slug, but we misname it 'id' to be able to use _match_id() + _TEST = { + 'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast', + 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', + 'info_dict': { + 'id': '5c271b40b13fd613090034fd', + 'ext': 'mp4', + 'title': 'That Time Disney Remade Beauty and the Beast', + 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. 
We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', + 'upload_date': '20180731', + 'timestamp': 1533009600, + #'uploader': 'Lindsay Ellis', # TODO: removed because unreliable/sometimes incorrect + } + } + _WORKING = False # this is set to False because the test won't pass without an auth cookie for a (paid) subscription + + def _extract_state_object(self, webpage, display_id): + """ + As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script + tag. This function is extracting this script tag, parsing it as JSON. + """ + initial_state_object = self._search_regex(r'', webpage, 'initial_state') + metadata = self._parse_json(initial_state_object, video_id=display_id) # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead? + + return metadata + + def _extract_video_metadata(self, state_object, display_id): + """ + The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the + video ID, we can then extract a dictionary with various meta data about the video itself. + """ + video_id = state_object['videos']['byURL'][display_id] + video_meta = state_object['videos']['byID'][video_id] + + return video_id, video_meta + + def _extract_video_url(self, webpage, state_object, video_id): + """ + To get the embed URL of the actual video stream, we could reconstruct it from the video ID, but it seems a + bit more stable to extract the iframe source that links to the video. + """ + iframe = self._search_regex(r'', webpage, 'iframe', fatal=False) + video_url = self._search_regex(r'src="(.+?)"', iframe, 'iframe-src', fatal=False) if iframe else None + + # fallback: reconstruct using video ID and access token from state object + if not video_url: + access_token = state_object['account']['userInfo']['zypeAuthInfo']['accessToken'] + video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(video_id=video_id, access_token=access_token) + + return video_url + + def _extract_uploader(self, video_meta): + """ + Nebula doesn't really seem to have the concept of an uploader internally, videos are often organized + more like a (TV) series than by uploader. But in the example case, Lindsay Ellis is the creator, so + I'll go with this for now. + """ + return video_meta['categories'][0]['value'][0] + + def _real_extract(self, url): + # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests + if COOKIE_NEBULA_AUTH: + self._set_cookie('watchnebula.com', 'nebula-auth', COOKIE_NEBULA_AUTH) + + # extract the video's display ID from the URL (we'll retrieve the video ID later) + display_id = self._match_id(url) + + # download the page + webpage = self._download_webpage(url, video_id=display_id) # TODO: what video ID do I supply, as I don't know it yet? _download_webpage doesn't accept a display_id instead... 
+ + # extract the state object from the webpage, and then retrieve video meta data from it + state_object = self._extract_state_object(webpage, display_id) + video_id, video_meta = self._extract_video_metadata(state_object, display_id) + + # extract the video URL from the webpage + video_url = self._extract_video_url(webpage, state_object, video_id) + + return { + 'id': video_id, + 'display_id': display_id, + + # we're passing this video URL on to the 'Zype' extractor (that's the video infrastructure that Nebula is + # built on top of) and use the 'url_transparent' type to indicate that our meta data should be better than + # whatever the Zype extractor is able to identify + '_type': 'url_transparent', + 'ie_key': 'Zype', + 'url': video_url, + + # the meta data we were able to extract from Nebula + 'title': video_meta['title'], + 'description': video_meta['description'], + 'timestamp': parse_iso8601(video_meta['published_at']), + #'uploader': self._extract_uploader(video_meta), # TODO: removed because unreliable/sometimes incorrect + 'thumbnails': [ + { + 'id': tn['name'], # this appears to be null in all cases I've seen + 'url': tn['url'], + 'width': tn['width'], + 'height': tn['height'], + } for tn in video_meta['thumbnails'] + ], + 'duration': video_meta['duration'], + # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from! + # TODO: channel + # TODO: channel_id + # TODO: channel_url + } From 469cae38cd58bba0f44300b2c42d1ac2c9ade0ec Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Thu, 16 Apr 2020 04:34:17 +0200 Subject: [PATCH 02/19] [nebula] Add additional test cases and improve cookie envvar handling --- youtube_dl/extractor/nebula.py | 61 +++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index e22a4b08886..6aa4e1da482 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -23,21 +23,52 @@ class NebulaIE(InfoExtractor): This description has been last updated on 2020-04-07. """ - _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the slug, but we misname it 'id' to be able to use _match_id() - _TEST = { - 'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast', - 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', - 'info_dict': { - 'id': '5c271b40b13fd613090034fd', - 'ext': 'mp4', - 'title': 'That Time Disney Remade Beauty and the Beast', - 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. 
We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', - 'upload_date': '20180731', - 'timestamp': 1533009600, - #'uploader': 'Lindsay Ellis', # TODO: removed because unreliable/sometimes incorrect - } - } - _WORKING = False # this is set to False because the test won't pass without an auth cookie for a (paid) subscription + _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id() + _TESTS = [ + { + 'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast', + 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', + 'info_dict': { + 'id': '5c271b40b13fd613090034fd', + 'ext': 'mp4', + 'title': 'That Time Disney Remade Beauty and the Beast', + 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', + 'upload_date': '20180731', + 'timestamp': 1533009600, + 'channel': 'Lindsay Ellis', + 'uploader': 'Lindsay Ellis', + } + }, + { + 'url': 'https://watchnebula.com/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'md5': 'b0b171504d67e2822179149ccd6787db', + 'info_dict': { + 'id': '5e7e78171aaf320001fbd6be', + 'ext': 'mp4', + 'title': 'Landing Craft - How The Allies Got Ashore', + 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', + 'upload_date': '20200327', + 'timestamp': 1585348140, + 'channel': 'The Logistics of D-Day', + 'uploader': 'The Logistics of D-Day', + } + }, + { + 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', + 'md5': '98e96346caa3b303fec4493c5d49dcb5', + 'info_dict': { + 'id': '5e779ebdd157bc0001d1c75a', + 'ext': 'mp4', + 'title': 'Episode 1: The Draw', + 'description': r're:^There’s free money on offer… if the players can all work together.', + 'upload_date': '20200323', + 'timestamp': 1584980400, + 'channel': 'Tom Scott Presents: Money', + 'uploader': 'Tom Scott Presents: Money', + } + }, + ] + _WORKING = True # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription? def _extract_state_object(self, webpage, display_id): """ From 61cead323579fd8d15d39566dae708f4b53e4a76 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Thu, 16 Apr 2020 04:35:05 +0200 Subject: [PATCH 03/19] [nebula] Add better channel title extraction (refs #21258) --- youtube_dl/extractor/nebula.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 6aa4e1da482..828ea1c6f46 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -105,13 +105,32 @@ def _extract_video_url(self, webpage, state_object, video_id): return video_url - def _extract_uploader(self, video_meta): + def _extract_channel(self, video_meta): """ - Nebula doesn't really seem to have the concept of an uploader internally, videos are often organized - more like a (TV) series than by uploader. But in the example case, Lindsay Ellis is the creator, so - I'll go with this for now. + Extract the channel title, by going through the list of categories and finding the first value of the + first category that has a value. + + I know this look like a terrible approach. 
But actually, it's just reproducing the behavior of the + React code the Nebula frontend uses (as of 2020-04-07): + + let channel; + if (video && video.categories && video.categories.length) { + const channelTitle = video.categories.map((category) => (category.value[0])) + .filter((title) => (!!title))[0]; + channel = getChannelByTitle(state, { title: channelTitle }); + } + + Basically, it finds the first (truthy) value in the category list and that's assumed to be the + channel title. And then the channel details (e.g. the URL) are looked up by title (!) (not by any + kind of ID) via an additional API call. + + TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL """ - return video_meta['categories'][0]['value'][0] + categories = video_meta['categories'] + for category in categories: + if category['value']: + return category['value'][0] + return None def _real_extract(self, url): # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests @@ -127,6 +146,7 @@ def _real_extract(self, url): # extract the state object from the webpage, and then retrieve video meta data from it state_object = self._extract_state_object(webpage, display_id) video_id, video_meta = self._extract_video_metadata(state_object, display_id) + channel_title = self._extract_channel(video_meta) # extract the video URL from the webpage video_url = self._extract_video_url(webpage, state_object, video_id) @@ -146,7 +166,6 @@ def _real_extract(self, url): 'title': video_meta['title'], 'description': video_meta['description'], 'timestamp': parse_iso8601(video_meta['published_at']), - #'uploader': self._extract_uploader(video_meta), # TODO: removed because unreliable/sometimes incorrect 'thumbnails': [ { 'id': tn['name'], # this appears to be null in all cases I've seen @@ -156,8 +175,9 @@ def _real_extract(self, url): } for tn in video_meta['thumbnails'] ], 'duration': video_meta['duration'], + 'channel': channel_title, + 'uploader': channel_title, # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from! - # TODO: channel # TODO: channel_id # TODO: channel_url } From af3434b839b2aae4cdf8ae607b34caa3560c663a Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Sat, 18 Apr 2020 06:15:03 +0200 Subject: [PATCH 04/19] [nebula] Relax meta data lookups --- youtube_dl/extractor/nebula.py | 48 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 828ea1c6f46..03886334851 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -4,7 +4,8 @@ import os from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..compat import compat_str +from ..utils import parse_iso8601, try_get COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH') # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests @@ -74,9 +75,13 @@ def _extract_state_object(self, webpage, display_id): """ As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script tag. This function is extracting this script tag, parsing it as JSON. 
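+
+        For illustration, the pieces of this state object that the methods below rely on look roughly like this
+        (a sketch; the slug and video ID are taken from the first test case, everything else is omitted):
+
+            {
+                "videos": {
+                    "byURL": {"that-time-disney-remade-beauty-and-the-beast": "5c271b40b13fd613090034fd"},
+                    "byID": {"5c271b40b13fd613090034fd": {"title": "...", "published_at": "...", "...": "..."}}
+                },
+                "account": {"userInfo": {"zypeAuthInfo": {"accessToken": "..."}}}
+            }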
+
+        May return None if no state object could be found or it didn't contain valid JSON.
         """
-        initial_state_object = self._search_regex(r'', webpage, 'initial_state')
-        metadata = self._parse_json(initial_state_object, video_id=display_id)  # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
+        initial_state_object = self._search_regex(
+            r'<script[^>]*id="initial-app-state"[^>]*>(.+?)</script>', webpage,
+            'initial_state', fatal=False, default=None)
+        metadata = self._parse_json(initial_state_object, video_id=display_id) if initial_state_object else None  # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
 
         return metadata
 
@@ -84,9 +89,12 @@ def _extract_video_metadata(self, state_object, display_id):
         """
         The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the
         video ID, we can then extract a dictionary with various meta data about the video itself.
+
+        May return (None, {}) if no state object was given or it didn't contain the expected lookup table or
+        meta data.
         """
-        video_id = state_object['videos']['byURL'][display_id]
-        video_meta = state_object['videos']['byID'][video_id]
+        video_id = try_get(state_object, lambda x: x['videos']['byURL'][display_id], compat_str)
+        video_meta = try_get(state_object, lambda x: x['videos']['byID'][video_id], dict) or {}
 
         return video_id, video_meta
 
@@ -100,8 +108,10 @@ def _extract_video_url(self, webpage, state_object, video_id):
 
         # fallback: reconstruct using video ID and access token from state object
         if not video_url:
-            access_token = state_object['account']['userInfo']['zypeAuthInfo']['accessToken']
-            video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(video_id=video_id, access_token=access_token)
+            access_token = try_get(state_object, lambda x: x['account']['userInfo']['zypeAuthInfo']['accessToken'],
+                                   compat_str)
+            video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(
+                video_id=video_id, access_token=access_token)
 
         return video_url
 
@@ -125,12 +135,13 @@ def _extract_channel(self, video_meta):
         kind of ID) via an additional API call.
 
         TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL
+
+        May return None if no category list could be found or no category had a label ('value').
         """
-        categories = video_meta['categories']
+        categories = video_meta.get('categories', []) if video_meta else []
        for category in categories:
-            if category['value']:
+            if category.get('value'):  # we're intentionally not using "'value' in category" here, because the expression is supposed to be falsy for empty lists in category['value'] as well!
return category['value'][0] - return None def _real_extract(self, url): # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests @@ -163,18 +174,17 @@ def _real_extract(self, url): 'url': video_url, # the meta data we were able to extract from Nebula - 'title': video_meta['title'], - 'description': video_meta['description'], - 'timestamp': parse_iso8601(video_meta['published_at']), + 'title': video_meta.get('title'), + 'description': video_meta.get('description'), + 'timestamp': parse_iso8601(video_meta.get('published_at')), 'thumbnails': [ { - 'id': tn['name'], # this appears to be null in all cases I've seen + 'id': tn.get('name'), # this appears to be null in all cases I've seen 'url': tn['url'], - 'width': tn['width'], - 'height': tn['height'], - } for tn in video_meta['thumbnails'] - ], - 'duration': video_meta['duration'], + 'width': tn.get('width'), + 'height': tn.get('height'), + } for tn in video_meta.get('thumbnails', [])], + 'duration': video_meta.get('duration'), 'channel': channel_title, 'uploader': channel_title, # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from! From 18582060c204dc0ff5cad4aa7a51e50b9b14b339 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Mon, 11 May 2020 05:58:14 +0200 Subject: [PATCH 05/19] [nebula] Rewrite extractor to new frontend (refs #21258) --- youtube_dl/extractor/nebula.py | 129 ++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 52 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 03886334851..9a6ddf6f34f 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -4,10 +4,8 @@ import os from .common import InfoExtractor -from ..compat import compat_str -from ..utils import parse_iso8601, try_get - -COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH') # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests +from ..compat import compat_urllib_parse_unquote, compat_str +from ..utils import parse_iso8601, ExtractorError, try_get class NebulaIE(InfoExtractor): @@ -15,13 +13,13 @@ class NebulaIE(InfoExtractor): Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos off-YouTube from a small hand-picked group of creators. - All videos require a subscription to watch. There are no known freely available videos. So the test case is - disabled (but should pass when supplying a 'nebula-auth' cookie for an account with a valid subscription). + All videos require a subscription to watch. There are no known freely available videos. An authentication token to + an account with a valid subscription can be specified in multiple ways. Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off video extraction to the Zype extractor. - This description has been last updated on 2020-04-07. + This description has been last updated on 2020-05-11. 
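+
+    For example, one way to run the extractor against the first test video below (a sketch; the token is the
+    'apiToken' value stored in your browser's nebula-auth cookie):
+
+        youtube-dl --video-password <token> https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast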
""" _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id() @@ -61,7 +59,7 @@ class NebulaIE(InfoExtractor): 'id': '5e779ebdd157bc0001d1c75a', 'ext': 'mp4', 'title': 'Episode 1: The Draw', - 'description': r're:^There’s free money on offer… if the players can all work together.', + 'description': r'contains:There’s free money on offer… if the players can all work together.', 'upload_date': '20200323', 'timestamp': 1584980400, 'channel': 'Tom Scott Presents: Money', @@ -71,49 +69,76 @@ class NebulaIE(InfoExtractor): ] _WORKING = True # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription? - def _extract_state_object(self, webpage, display_id): + def _retrieve_nebula_auth(self, video_id): """ - As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script - tag. This function is extracting this script tag, parsing it as JSON. + Attempt to find a Nebula API token. Makes multiple attempts in the following order: + a) the --video-password command line argument + b) the --cookies supplied cookie jar + c) the NEBULA_TOKEN environment variable + If none of these are successful, an end user-intended error message is returned, listing some solutions. - May return None if no state object could be found or it didn't contain valid JSON. + # TODO: are these authentication methods, in this order, the best practice for youtube-dl? """ - initial_state_object = self._search_regex( - r']*id="initial-app-state"[^>]*>(.+?)', webpage, - 'initial_state', fatal=False, default=None) - metadata = self._parse_json(initial_state_object, video_id=display_id) if initial_state_object else None # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead? - - return metadata - - def _extract_video_metadata(self, state_object, display_id): + nebula_token = self._downloader.params.get('videopassword') + if not nebula_token: + # TODO: is there a helper to do all this cookie extraction? + nebula_cookies = self._get_cookies('https://watchnebula.com') + nebula_cookie = nebula_cookies.get('nebula-auth') + if nebula_cookie: + nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) + nebula_token = self._parse_json(nebula_cookie_value, video_id).get('apiToken') + if not nebula_token and 'NEBULA_TOKEN' in os.environ: + nebula_token = os.environ.get('NEBULA_TOKEN') + if not nebula_token: + raise ExtractorError('Nebula requires an account with an active subscription. ' + 'You can supply a corresponding token by either ' + 'a) finding your nebula-auth cookie and then specifying it via --video-password, or ' + 'b) passing in a cookie jar containing a nebula-auth cookie via --cookies, or ' + 'c) setting the environment variable NEBULA_TOKEN.') + return nebula_token + + def _call_zype_api(self, path, params, video_id, api_key): """ - The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the - video ID, we can then extract a dictionary with various meta data about the video itself. - - May return (None, {}) if no state object was given or it didn't contain the expected lookup table or - meta data. + A helper for making calls to the Zype API. 
""" - video_id = try_get(state_object, lambda x: x['videos']['byURL'][display_id], compat_str) - video_meta = try_get(state_object, lambda x: x['videos']['byID'][video_id], dict) or {} + query = {'api_key': api_key, 'per_page': 1} + query.update(params) + return self._download_json('https://api.zype.com' + path, video_id, query=query) - return video_id, video_meta + def _fetch_zype_video_data(self, display_id, api_key): + """ + Fetch video meta data from the Zype API. + """ + response = self._call_zype_api('/videos', {'friendly_title': display_id}, display_id, api_key) + if 'response' not in response or len(response['response']) != 1: + raise ExtractorError('Unable to find video on Zype API') + return response['response'][0] - def _extract_video_url(self, webpage, state_object, video_id): + def _call_nebula_api(self, path, video_id, access_token): """ - To get the embed URL of the actual video stream, we could reconstruct it from the video ID, but it seems a - bit more stable to extract the iframe source that links to the video. + A helper for making calls to the Nebula API. """ - iframe = self._search_regex(r'', webpage, 'iframe', fatal=False) - video_url = self._search_regex(r'src="(.+?)"', iframe, 'iframe-src', fatal=False) if iframe else None + return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ + 'Authorization': 'Token {access_token}'.format(access_token=access_token) + }) - # fallback: reconstruct using video ID and access token from state object - if not video_url: - access_token = try_get(state_object, lambda x: x['account']['userInfo']['zypeAuthInfo']['accessToken'], - compat_str) - video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format( - video_id=video_id, access_token=access_token) + def _fetch_zype_access_token(self, video_id, nebula_token): + """ + Requests a Zype access token from the Nebula API. + """ + user_object = self._call_nebula_api('/auth/user', video_id, nebula_token) + access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) + if not access_token: + raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') + return access_token - return video_url + def _build_video_url(self, video_id, zype_access_token): + """ + Construct a Zype video URL (as supported by the Zype extractor), given a Zype video ID and a Zype access token. + """ + return 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format( + video_id=video_id, + access_token=zype_access_token) def _extract_channel(self, video_meta): """ @@ -144,23 +169,23 @@ def _extract_channel(self, video_meta): return category['value'][0] def _real_extract(self, url): - # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests - if COOKIE_NEBULA_AUTH: - self._set_cookie('watchnebula.com', 'nebula-auth', COOKIE_NEBULA_AUTH) - # extract the video's display ID from the URL (we'll retrieve the video ID later) display_id = self._match_id(url) - # download the page - webpage = self._download_webpage(url, video_id=display_id) # TODO: what video ID do I supply, as I don't know it yet? _download_webpage doesn't accept a display_id instead... 
+ # retrieve Nebula authentication information + nebula_token = self._retrieve_nebula_auth(display_id) + + # fetch video meta data from the Nebula API + api_key = 'JlSv9XTImxelHi-eAHUVDy_NUM3uAtEogEpEdFoWHEOl9SKf5gl9pCHB1AYbY3QF' # FIXME: extract from main chunk at runtime + video_meta = self._fetch_zype_video_data(display_id, api_key) + video_id = video_meta['_id'] - # extract the state object from the webpage, and then retrieve video meta data from it - state_object = self._extract_state_object(webpage, display_id) - video_id, video_meta = self._extract_video_metadata(state_object, display_id) + # extract additional info channel_title = self._extract_channel(video_meta) - # extract the video URL from the webpage - video_url = self._extract_video_url(webpage, state_object, video_id) + # fetch the access token for Zype, then construct the video URL + zype_access_token = self._fetch_zype_access_token(video_id, nebula_token=nebula_token) + video_url = self._build_video_url(video_id, zype_access_token) return { 'id': video_id, @@ -179,7 +204,7 @@ def _real_extract(self, url): 'timestamp': parse_iso8601(video_meta.get('published_at')), 'thumbnails': [ { - 'id': tn.get('name'), # this appears to be null in all cases I've seen + 'id': tn.get('name'), # this appears to be null in all cases I've encountered 'url': tn['url'], 'width': tn.get('width'), 'height': tn.get('height'), From 1317a43a6a332db66c984e6c66708f697ee7961a Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Fri, 15 May 2020 06:14:08 +0200 Subject: [PATCH 06/19] [nebula] Implement Zype API key retrieval from JS chunk --- youtube_dl/extractor/nebula.py | 63 +++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 9a6ddf6f34f..0ce229ad5b0 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -5,7 +5,7 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote, compat_str -from ..utils import parse_iso8601, ExtractorError, try_get +from ..utils import parse_iso8601, ExtractorError, try_get, urljoin class NebulaIE(InfoExtractor): @@ -97,36 +97,81 @@ def _retrieve_nebula_auth(self, video_id): 'c) setting the environment variable NEBULA_TOKEN.') return nebula_token - def _call_zype_api(self, path, params, video_id, api_key): + def _retrieve_zype_api_key(self, page_url, display_id): + """ + Retrieves the Zype API key required to make calls to the Zype API. + + Unfortunately, the Nebula frontend stores this as a JS object literal in one of its JS chunks, + looking somewhat like this (but minified): + + return { + NODE_ENV: "production", + REACT_APP_NAME: "Nebula", + REACT_APP_NEBULA_API: "https://api.watchnebula.com/api/v1/", + REACT_APP_ZYPE_API: "https://api.zype.com/", + REACT_APP_ZYPE_API_KEY: "", + REACT_APP_ZYPE_APP_KEY: "", + // ... + } + + So we have to find the reference to the chunk in the video page (as it is hashed and the hash will + change when they do a new release), then download the chunk and extract the API key from there, + hoping they won't rename the constant. + + Alternatively, it is currently hardcoded and shared among all users. We haven't seen it + change so far, so we could also just hardcode it in the extractor as a fallback. 
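+
+        For illustration, the script reference we are looking for appears in the page HTML roughly like this
+        (path prefix and hash are made-up examples; the hash changes with every frontend release):
+
+            <script src="/static/js/main.3b97b4c0.chunk.js"></script>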
+        """
+        # fetch the video page
+        webpage = self._download_webpage(page_url, video_id=display_id)
+
+        # find the script tag with a file named 'main.<hash>.chunk.js' in there
+        main_script_relpath = self._search_regex(
+            r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage,
+            group='script_relpath', name='script relative path', fatal=True)
+
+        # fetch the JS chunk
+        main_script_abspath = urljoin(page_url, main_script_relpath)
+        main_script = self._download_webpage(main_script_abspath, video_id=display_id,
+                                              note='Retrieving Zype API key')
+
+        # find the API key named 'REACT_APP_ZYPE_API_KEY' in there
+        api_key = self._search_regex(
+            r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script,
+            group='api_key', name='API key', fatal=True)
+
+        return api_key
+
+    def _call_zype_api(self, path, params, video_id, api_key, note):
         """
         A helper for making calls to the Zype API.
         """
         query = {'api_key': api_key, 'per_page': 1}
         query.update(params)
-        return self._download_json('https://api.zype.com' + path, video_id, query=query)
+        return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)
 
     def _fetch_zype_video_data(self, display_id, api_key):
         """
         Fetch video meta data from the Zype API.
         """
-        response = self._call_zype_api('/videos', {'friendly_title': display_id}, display_id, api_key)
+        response = self._call_zype_api('/videos', {'friendly_title': display_id},
+                                       display_id, api_key, note='Retrieving metadata from Zype')
         if 'response' not in response or len(response['response']) != 1:
             raise ExtractorError('Unable to find video on Zype API')
         return response['response'][0]
 
-    def _call_nebula_api(self, path, video_id, access_token):
+    def _call_nebula_api(self, path, video_id, access_token, note):
         """
         A helper for making calls to the Nebula API.
         """
         return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={
             'Authorization': 'Token {access_token}'.format(access_token=access_token)
-        })
+        }, note=note)
 
     def _fetch_zype_access_token(self, video_id, nebula_token):
         """
         Requests a Zype access token from the Nebula API.
""" - user_object = self._call_nebula_api('/auth/user', video_id, nebula_token) + user_object = self._call_nebula_api('/auth/user', video_id, nebula_token, note='Retrieving Zype access token') access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) if not access_token: raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') @@ -176,7 +221,7 @@ def _real_extract(self, url): nebula_token = self._retrieve_nebula_auth(display_id) # fetch video meta data from the Nebula API - api_key = 'JlSv9XTImxelHi-eAHUVDy_NUM3uAtEogEpEdFoWHEOl9SKf5gl9pCHB1AYbY3QF' # FIXME: extract from main chunk at runtime + api_key = self._retrieve_zype_api_key(url, display_id) video_meta = self._fetch_zype_video_data(display_id, api_key) video_id = video_meta['_id'] @@ -184,7 +229,7 @@ def _real_extract(self, url): channel_title = self._extract_channel(video_meta) # fetch the access token for Zype, then construct the video URL - zype_access_token = self._fetch_zype_access_token(video_id, nebula_token=nebula_token) + zype_access_token = self._fetch_zype_access_token(display_id, nebula_token=nebula_token) video_url = self._build_video_url(video_id, zype_access_token) return { From 30362440dcbf35f2ccce3b5c24b4d4c5ee809c92 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Tue, 2 Jun 2020 04:57:37 +0200 Subject: [PATCH 07/19] [nebula] Improve performance by avoiding redirect --- youtube_dl/extractor/nebula.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 0ce229ad5b0..9a2828e1913 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -171,7 +171,7 @@ def _fetch_zype_access_token(self, video_id, nebula_token): """ Requests a Zype access token from the Nebula API. 
""" - user_object = self._call_nebula_api('/auth/user', video_id, nebula_token, note='Retrieving Zype access token') + user_object = self._call_nebula_api('/auth/user/', video_id, nebula_token, note='Retrieving Zype access token') access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) if not access_token: raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') From f8eb89748bd46e3f75d847c973453f6f7e5a618e Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Sat, 3 Oct 2020 05:41:27 +0200 Subject: [PATCH 08/19] [nebula] Update test video checksums --- youtube_dl/extractor/nebula.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 9a2828e1913..4b8cca8fd6a 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -40,7 +40,7 @@ class NebulaIE(InfoExtractor): }, { 'url': 'https://watchnebula.com/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'md5': 'b0b171504d67e2822179149ccd6787db', + 'md5': '6d4edd14ce65720fa63aba5c583fb328', 'info_dict': { 'id': '5e7e78171aaf320001fbd6be', 'ext': 'mp4', @@ -54,7 +54,7 @@ class NebulaIE(InfoExtractor): }, { 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', - 'md5': '98e96346caa3b303fec4493c5d49dcb5', + 'md5': '8c7d272910eea320f6f8e6d3084eecf5', 'info_dict': { 'id': '5e779ebdd157bc0001d1c75a', 'ext': 'mp4', From 2562c9ec74b0690a36b6e87bef77524c2548e23f Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Sat, 3 Oct 2020 06:08:30 +0200 Subject: [PATCH 09/19] [nebula] Implement PoC of netrc authentication --- youtube_dl/extractor/nebula.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 4b8cca8fd6a..6566dc2d29e 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import json import os from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote, compat_str -from ..utils import parse_iso8601, ExtractorError, try_get, urljoin +from ..utils import parse_iso8601, ExtractorError, try_get, urljoin, sanitized_Request class NebulaIE(InfoExtractor): @@ -68,6 +69,22 @@ class NebulaIE(InfoExtractor): }, ] _WORKING = True # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription? + _NETRC_MACHINE = 'watchnebula' + + def _perform_login(self, username, password, video_id): + """ + Perform login to Nebula. + + Takes a username (email address) and password. Returns a Nebula token. + """ + data = json.dumps({'email': username, 'password': password}).encode('utf8') + request = sanitized_Request(method='POST', + url='https://api.watchnebula.com/api/v1/auth/login/', + data=data, + headers={'content-type': 'application/json'}) + response = self._download_json(request, fatal=True, video_id=video_id, + note='Logging in to Nebula') + return response['key'] def _retrieve_nebula_auth(self, video_id): """ @@ -79,6 +96,11 @@ def _retrieve_nebula_auth(self, video_id): # TODO: are these authentication methods, in this order, the best practice for youtube-dl? 
""" + + username, password = self._get_login_info() + nebula_token = self._perform_login(username, password, video_id) + return nebula_token + nebula_token = self._downloader.params.get('videopassword') if not nebula_token: # TODO: is there a helper to do all this cookie extraction? From 8b4c9da62a4828f9bd1cf46c70fa91c8410e02e3 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Fri, 23 Oct 2020 05:43:55 +0200 Subject: [PATCH 10/19] [nebula] Clean up credentials-based authentication --- youtube_dl/extractor/nebula.py | 51 +++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 6566dc2d29e..e5e2b404819 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -15,12 +15,15 @@ class NebulaIE(InfoExtractor): off-YouTube from a small hand-picked group of creators. All videos require a subscription to watch. There are no known freely available videos. An authentication token to - an account with a valid subscription can be specified in multiple ways. + an account with a valid subscription can be specified in multiple ways, including credentials in .netrc or a cookie + jar. + As neither of these parameters appear to be supported by the unit test runner, it's recommended to set the envvar + NEBULA_TOKEN to execute the test runs. Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off video extraction to the Zype extractor. - This description has been last updated on 2020-05-11. + This description has been last updated on 2020-10-22. """ _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id() @@ -73,35 +76,44 @@ class NebulaIE(InfoExtractor): def _perform_login(self, username, password, video_id): """ - Perform login to Nebula. + Log in to Nebula, authenticating using a given username and password. - Takes a username (email address) and password. Returns a Nebula token. + Returns a Nebula token, as the frontend would store it in the + nebula-auth cookie. Or False, if authentication fails. """ data = json.dumps({'email': username, 'password': password}).encode('utf8') request = sanitized_Request(method='POST', url='https://api.watchnebula.com/api/v1/auth/login/', data=data, headers={'content-type': 'application/json'}) - response = self._download_json(request, fatal=True, video_id=video_id, - note='Logging in to Nebula') + response = self._download_json(request, fatal=False, video_id=video_id, + note='Authenticating to Nebula with supplied credentials', + errnote='Authentication failed or rejected') + if not response or 'key' not in response: + return False return response['key'] def _retrieve_nebula_auth(self, video_id): """ - Attempt to find a Nebula API token. Makes multiple attempts in the following order: - a) the --video-password command line argument + Attempt to find a Nebula API token. Makes multiple attempts in the + following order: + a) login credentials used to authenticate to the Nebula login endpoint, + either from .netrc or specified using --username/--password b) the --cookies supplied cookie jar c) the NEBULA_TOKEN environment variable - If none of these are successful, an end user-intended error message is returned, listing some solutions. - - # TODO: are these authentication methods, in this order, the best practice for youtube-dl? 
+ d) the --video-password command line argument (this isn't documented in + the error message, because probably highly unpopular) + If none of these are successful, an end user-intended error message is + raised, listing some solutions. """ + nebula_token = None + # option #1: login credentials via .netrc or --username and --password username, password = self._get_login_info() - nebula_token = self._perform_login(username, password, video_id) - return nebula_token + if username and password: + nebula_token = self._perform_login(username, password, video_id) - nebula_token = self._downloader.params.get('videopassword') + # option #2: nebula token via cookie jar if not nebula_token: # TODO: is there a helper to do all this cookie extraction? nebula_cookies = self._get_cookies('https://watchnebula.com') @@ -109,12 +121,19 @@ def _retrieve_nebula_auth(self, video_id): if nebula_cookie: nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) nebula_token = self._parse_json(nebula_cookie_value, video_id).get('apiToken') + + # option #3: nebula token via environment variable if not nebula_token and 'NEBULA_TOKEN' in os.environ: nebula_token = os.environ.get('NEBULA_TOKEN') + + # option #4: nebula token via --videopassword + if not nebula_token: + nebula_token = self._downloader.params.get('videopassword') + if not nebula_token: raise ExtractorError('Nebula requires an account with an active subscription. ' - 'You can supply a corresponding token by either ' - 'a) finding your nebula-auth cookie and then specifying it via --video-password, or ' + 'You can supply your authentication information by either ' + 'a) storing your credentials in .netrc or supplying them via --username and --password, or ' 'b) passing in a cookie jar containing a nebula-auth cookie via --cookies, or ' 'c) setting the environment variable NEBULA_TOKEN.') return nebula_token From 59c0e6e3d830addda6a03012580fbc36d0acff09 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Wed, 25 Nov 2020 11:16:48 +0100 Subject: [PATCH 11/19] [nebula] Log attempted authentication method --- youtube_dl/extractor/nebula.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index e5e2b404819..c55535f6f50 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -111,6 +111,7 @@ def _retrieve_nebula_auth(self, video_id): # option #1: login credentials via .netrc or --username and --password username, password = self._get_login_info() if username and password: + self.to_screen('Authenticating to Nebula using .netrc or command line-supplied credentials') nebula_token = self._perform_login(username, password, video_id) # option #2: nebula token via cookie jar @@ -119,16 +120,20 @@ def _retrieve_nebula_auth(self, video_id): nebula_cookies = self._get_cookies('https://watchnebula.com') nebula_cookie = nebula_cookies.get('nebula-auth') if nebula_cookie: + self.to_screen('Authenticating to Nebula with credentials from cookie jar') nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) nebula_token = self._parse_json(nebula_cookie_value, video_id).get('apiToken') # option #3: nebula token via environment variable if not nebula_token and 'NEBULA_TOKEN' in os.environ: nebula_token = os.environ.get('NEBULA_TOKEN') + if nebula_token: + self.to_screen('Authenticating to Nebula with token from NEBULA_TOKEN environment variable') # option #4: nebula token via --videopassword if not nebula_token: nebula_token = 
self._downloader.params.get('videopassword') + if nebula_token: self.to_screen('Authenticating to Nebula with token from --videopassword') if not nebula_token: raise ExtractorError('Nebula requires an account with an active subscription. ' From 9fdfd6d3ba84c83daa3d15d666602779ff54e5f3 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Sun, 17 Jan 2021 15:46:33 +0100 Subject: [PATCH 12/19] [nebula] Prevent cookies from breaking Nebula auth When the 'sessionid' cookie is submitted to the `/auth/login/` endpoint, the response is always a 403. This typically happens when youtube_dl is run with both `--netrc` and `--cookies` as your default configuration. In that situation, the first authentication succeeds and stores the `sessionid` cookie in the cookie jar. During subsequent authentication attempts, the cookie is sent alongside and causes the authentication to fail. This is very unexpected and we therefore specifically handle this case. --- youtube_dl/extractor/nebula.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index c55535f6f50..0023a28834c 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -85,7 +85,12 @@ def _perform_login(self, username, password, video_id): request = sanitized_Request(method='POST', url='https://api.watchnebula.com/api/v1/auth/login/', data=data, - headers={'content-type': 'application/json'}) + headers={ + 'content-type': 'application/json', + # Overwrite the cookie headers, because + # submitting the 'sessionid' cookie + # always causes a 403 on auth endpoint + 'cookie': ''}) response = self._download_json(request, fatal=False, video_id=video_id, note='Authenticating to Nebula with supplied credentials', errnote='Authentication failed or rejected') @@ -105,6 +110,9 @@ def _retrieve_nebula_auth(self, video_id): the error message, because probably highly unpopular) If none of these are successful, an end user-intended error message is raised, listing some solutions. + + Returns a Nebula API token, which subsequently can be used to make + authenticated calls to the Nebula API. """ nebula_token = None From a0f69f95267780bf87bae2b252245a1145c1d5d6 Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Sun, 17 Jan 2021 15:49:12 +0100 Subject: [PATCH 13/19] [nebula] Fix stale session issues When Nebula isn't accessed for a while, the Zype access token stored on the Nebula backend expires. It is then no longer returned by the user endpoint. The Nebula frontend has the same issue and keeps polling for the Zype token in this case. This isn't implemented in this extractor yet, but at least a specific error message now prints some helpful advice. --- youtube_dl/extractor/nebula.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 0023a28834c..5b3c2cbf73c 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -141,7 +141,8 @@ def _retrieve_nebula_auth(self, video_id): # option #4: nebula token via --videopassword if not nebula_token: nebula_token = self._downloader.params.get('videopassword') - if nebula_token: self.to_screen('Authenticating to Nebula with token from --videopassword') + if nebula_token: + self.to_screen('Authenticating to Nebula with token from --videopassword') if not nebula_token: raise ExtractorError('Nebula requires an account with an active subscription. 
' @@ -228,6 +229,8 @@ def _fetch_zype_access_token(self, video_id, nebula_token): user_object = self._call_nebula_api('/auth/user/', video_id, nebula_token, note='Retrieving Zype access token') access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) if not access_token: + if try_get(user_object, lambda x: x['is_subscribed'], bool): + raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint, please try loading an arbitrary video in a browser with this account to ''prime'' it for video downloading') raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') return access_token From ddbadd037fd289de81a0e741dfce532e79821151 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 23 Nov 2024 10:31:42 +0000 Subject: [PATCH 14/19] Update PR with back-port from its development in yt-dlp --- docs/supportedsites.md | 1 - youtube_dl/extractor/extractors.py | 7 +- youtube_dl/extractor/nebula.py | 871 +++++++++++++++++++---------- 3 files changed, 568 insertions(+), 311 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a4372912e36..aa8026a322e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -569,7 +569,6 @@ - **ndr:embed** - **ndr:embed:base** - **NDTV** - - **Nebula** - **NerdCubedFeed** - **netease:album**: 网易云音乐 - 专辑 - **netease:djradio**: 网易云音乐 - 电台 diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9874441d5a6..1495eb5b4bb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -731,7 +731,12 @@ NJoyEmbedIE, ) from .ndtv import NDTVIE -from .nebula import NebulaIE +from .nebula import ( + NebulaIE, + NebulaChannelIE, + NebulaClassIE, + NebulaSubscriptionsIE, +) from .nerdcubed import NerdCubedFeedIE from .netzkino import NetzkinoIE from .neteasemusic import ( diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py index 5b3c2cbf73c..3e55db9391d 100644 --- a/youtube_dl/extractor/nebula.py +++ b/youtube_dl/extractor/nebula.py @@ -1,320 +1,573 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import os +import itertools +from .art19 import Art19IE from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote, compat_str -from ..utils import parse_iso8601, ExtractorError, try_get, urljoin, sanitized_Request - - -class NebulaIE(InfoExtractor): - """ - Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos - off-YouTube from a small hand-picked group of creators. - - All videos require a subscription to watch. There are no known freely available videos. An authentication token to - an account with a valid subscription can be specified in multiple ways, including credentials in .netrc or a cookie - jar. - As neither of these parameters appear to be supported by the unit test runner, it's recommended to set the envvar - NEBULA_TOKEN to execute the test runs. - - Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off - video extraction to the Zype extractor. - - This description has been last updated on 2020-10-22. 
- """ - - _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P[-\w]+)' # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id() - _TESTS = [ - { - 'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast', - 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', - 'info_dict': { - 'id': '5c271b40b13fd613090034fd', - 'ext': 'mp4', - 'title': 'That Time Disney Remade Beauty and the Beast', - 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', - 'upload_date': '20180731', - 'timestamp': 1533009600, - 'channel': 'Lindsay Ellis', - 'uploader': 'Lindsay Ellis', - } +from ..compat import ( + compat_HTTPError as HTTPError, + compat_kwargs, + compat_str as str, +) +from ..utils import ( + ExtractorError, + int_or_none, + json_stringify, + # make_archive_id, + merge_dicts, + parse_iso8601, + smuggle_url, + str_or_none, + T, + traverse_obj, + try_call, + unsmuggle_url, + update_url, + url_basename, + url_or_none, + urljoin, +) + +_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' + + +class NebulaBaseIE(InfoExtractor): + _NETRC_MACHINE = 'watchnebula' + _token = _api_token = None + + def _real_initialize(self): + self._login() + + def _login(self): + if not self._api_token: + self._api_token = try_call( + lambda: self._get_cookies('https://nebula.tv')['nebula_auth.apiToken'].value) + self._token = self._download_json( + 'https://users.api.nebula.app/api/v1/authorization/', None, + headers={'Authorization': 'Token {0}'.format(self._api_token)} if self._api_token else {}, + note='Authorizing to Nebula', data=b'')['token'] + if self._token: + return + + username, password = self._get_login_info() + if username is None: + return + self._perform_login(username, password) + + def _perform_login(self, username, password): + try: + response = self._download_json( + 'https://nebula.tv/auth/login/', None, + 'Logging in to Nebula', 'Login failed', + data=json_stringify({'email': username, 'password': password}), + headers={'content-type': 'application/json'}) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: + raise ExtractorError('Login failed: Invalid username or password', expected=True) + raise + self._api_token = traverse_obj(response, ('key', T(str))) + if not self._api_token: + raise ExtractorError('Login failed: No token') + + def _call_api(self, *args, **kwargs): + + def kwargs_set_token(kw): + kw.setdefault('headers', {})['Authorization'] = 'Bearer {0}'.format(self._token) + return compat_kwargs(kw) + + if self._token: + kwargs = kwargs_set_token(kwargs) + try: + return self._download_json(*args, **kwargs) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status not in (401, 403): + raise + self.to_screen( + 'Reauthorizing with Nebula and retrying, because last API ' + 'call resulted in error {0}'.format(e.cause.status)) + self._real_initialize() + if self._token: + kwargs = kwargs_set_token(kwargs) + return self._download_json(*args, **kwargs) + + def _extract_formats(self, content_id, slug): + for retry in (False, True): + try: + # fmts, subs = self._extract_m3u8_formats_and_subtitles( + fmts, subs = self._extract_m3u8_formats( + 'https://content.api.nebula.app/{0}s/{1}/manifest.m3u8'.format( + content_id.split(':', 1)[0], content_id), + slug, 'mp4', query={ + 'token': 
self._token, + 'app_version': '23.10.0', + 'platform': 'ios', + }), {} + self._sort_formats(fmts) + return {'formats': fmts, 'subtitles': subs} + except ExtractorError as e: + if not isinstance(e.cause, HTTPError): + raise + if e.cause.status == 401: + self.raise_login_required() + if not retry and e.cause.status == 403: + self.to_screen('Reauthorizing with Nebula and retrying, because fetching video resulted in error') + self._real_initialize() + continue + raise + + def _extract_video_metadata(self, episode): + channel_url = traverse_obj( + episode, (('channel_slug', 'class_slug'), T(lambda u: urljoin('https://nebula.tv/', u))), get_all=False) + return merge_dicts({ + 'id': episode['id'].partition(':')[2], + 'title': episode['title'], + 'channel_url': channel_url, + 'uploader_url': channel_url, + }, traverse_obj(episode, { + 'display_id': 'slug', + 'description': 'description', + 'timestamp': ('published_at', T(parse_iso8601)), + 'duration': ('duration', T(int_or_none)), + 'channel_id': 'channel_slug', + 'uploader_id': 'channel_slug', + 'channel': 'channel_title', + 'uploader': 'channel_title', + 'series': 'channel_title', + 'creator': 'channel_title', + 'thumbnail': ('images', 'thumbnail', 'src', T(url_or_none)), + 'episode_number': ('order', {int_or_none}), + # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE + # '_old_archive_ids': ('zype_id', {lambda x: [ + # make_archive_id(NebulaIE, x), make_archive_id(NebulaSubscriptionsIE, x)] if x else None}), + })) + + +class NebulaIE(NebulaBaseIE): + IE_NAME = 'nebula:video' + _VALID_URL = r'{0}/videos/(?P[\w-]+)'.format(_BASE_URL_RE) + _TESTS = [{ + 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', + 'info_dict': { + 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf', + 'ext': 'mp4', + 'title': 'That Time Disney Remade Beauty and the Beast', + 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. 
We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', + 'upload_date': '20180731', + 'timestamp': 1533009600, + 'channel': 'Lindsay Ellis', + 'channel_id': 'lindsayellis', + 'uploader': 'Lindsay Ellis', + 'uploader_id': 'lindsayellis', + 'uploader_url': r're:https://nebula\.(tv|app)/lindsayellis', + 'series': 'Lindsay Ellis', + 'display_id': 'that-time-disney-remade-beauty-and-the-beast', + 'channel_url': r're:https://nebula\.(tv|app)/lindsayellis', + 'creator': 'Lindsay Ellis', + 'duration': 2212, + 'thumbnail': r're:https?://images\.nebula\.tv/[a-f\d-]+$', + # '_old_archive_ids': ['nebula 5c271b40b13fd613090034fd', 'nebulasubscriptions 5c271b40b13fd613090034fd'], }, - { - 'url': 'https://watchnebula.com/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'md5': '6d4edd14ce65720fa63aba5c583fb328', - 'info_dict': { - 'id': '5e7e78171aaf320001fbd6be', - 'ext': 'mp4', - 'title': 'Landing Craft - How The Allies Got Ashore', - 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', - 'upload_date': '20200327', - 'timestamp': 1585348140, - 'channel': 'The Logistics of D-Day', - 'uploader': 'The Logistics of D-Day', - } + 'params': { + 'format': 'bestvideo', + 'skip_download': 'm3u8', }, - { - 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', - 'md5': '8c7d272910eea320f6f8e6d3084eecf5', - 'info_dict': { - 'id': '5e779ebdd157bc0001d1c75a', - 'ext': 'mp4', - 'title': 'Episode 1: The Draw', - 'description': r'contains:There’s free money on offer… if the players can all work together.', - 'upload_date': '20200323', - 'timestamp': 1584980400, - 'channel': 'Tom Scott Presents: Money', - 'uploader': 'Tom Scott Presents: Money', - } + }, { + 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'md5': 'd05739cf6c38c09322422f696b569c23', + 'info_dict': { + 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34', + 'ext': 'mp4', + 'title': 'Landing Craft - How The Allies Got Ashore', + 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', + 'upload_date': '20200327', + 'timestamp': 1585348140, + 'channel': 'Real Engineering — The Logistics of D-Day', + 'channel_id': 'd-day', + 'uploader': 'Real Engineering — The Logistics of D-Day', + 'uploader_id': 'd-day', + 'series': 'Real Engineering — The Logistics of D-Day', + 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'creator': 'Real Engineering — The Logistics of D-Day', + 'duration': 841, + 'channel_url': 'https://nebula.tv/d-day', + 'uploader_url': 'https://nebula.tv/d-day', + 'thumbnail': r're:https?://images\.nebula\.tv/[a-f\d-]+$', + # '_old_archive_ids': ['nebula 5e7e78171aaf320001fbd6be', 'nebulasubscriptions 5e7e78171aaf320001fbd6be'], }, - ] - _WORKING = True # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth cookie of a (paid) subscription? 
- _NETRC_MACHINE = 'watchnebula' + 'params': { + 'format': 'bestvideo', + 'skip_download': 'm3u8', + }, + 'skip': 'Only available for registered users', + }, { + 'url': 'https://nebula.tv/videos/money-episode-1-the-draw', + 'md5': 'ebe28a7ad822b9ee172387d860487868', + 'info_dict': { + 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553', + 'ext': 'mp4', + 'title': 'Episode 1: The Draw', + 'description': r'contains:There’s free money on offer… if the players can all work together.', + 'upload_date': '20200323', + 'timestamp': 1584980400, + 'channel': 'Tom Scott Presents: Money', + 'channel_id': 'tom-scott-presents-money', + 'uploader': 'Tom Scott Presents: Money', + 'uploader_id': 'tom-scott-presents-money', + 'uploader_url': 'https://nebula.tv/tom-scott-presents-money', + 'duration': 825, + 'channel_url': 'https://nebula.tv/tom-scott-presents-money', + 'series': 'Tom Scott Presents: Money', + 'display_id': 'money-episode-1-the-draw', + 'thumbnail': r're:https?://images\.nebula\.tv/[a-f\d-]+$', + # '_old_archive_ids': ['nebula 5e779ebdd157bc0001d1c75a', 'nebulasubscriptions 5e779ebdd157bc0001d1c75a'], + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': 'm3u8', + }, + 'skip': 'Only available for registered users', + }, { + 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', + 'only_matching': True, + }, { + 'url': 'https://nebula.tv/videos/tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines', + 'info_dict': { + 'id': 'e389af9d-1dab-44f2-8788-ee24deb7ff0d', + 'ext': 'mp4', + 'display_id': 'tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines', + 'title': 'Did the US Really Blow Up the NordStream Pipelines?', + 'description': 'md5:b4e2a14e3ff08f546a3209c75261e789', + 'upload_date': '20230223', + 'timestamp': 1677144070, + 'channel': 'TLDR News EU', + 'channel_id': 'tldrnewseu', + 'uploader': 'TLDR News EU', + 'uploader_id': 'tldrnewseu', + 'uploader_url': r're:https://nebula\.(tv|app)/tldrnewseu', + 'duration': 524, + 'channel_url': r're:https://nebula\.(tv|app)/tldrnewseu', + 'series': 'TLDR News EU', + 'thumbnail': r're:https?://images\.nebula\.tv/[a-f\d-]+$', + 'creator': 'TLDR News EU', + # '_old_archive_ids': ['nebula 63f64c74366fcd00017c1513', 'nebulasubscriptions 63f64c74366fcd00017c1513'], + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw', + 'only_matching': True, + }] - def _perform_login(self, username, password, video_id): - """ - Log in to Nebula, authenticating using a given username and password. - - Returns a Nebula token, as the frontend would store it in the - nebula-auth cookie. Or False, if authentication fails. - """ - data = json.dumps({'email': username, 'password': password}).encode('utf8') - request = sanitized_Request(method='POST', - url='https://api.watchnebula.com/api/v1/auth/login/', - data=data, - headers={ - 'content-type': 'application/json', - # Overwrite the cookie headers, because - # submitting the 'sessionid' cookie - # always causes a 403 on auth endpoint - 'cookie': ''}) - response = self._download_json(request, fatal=False, video_id=video_id, - note='Authenticating to Nebula with supplied credentials', - errnote='Authentication failed or rejected') - if not response or 'key' not in response: - return False - return response['key'] - - def _retrieve_nebula_auth(self, video_id): - """ - Attempt to find a Nebula API token. 
Makes multiple attempts in the - following order: - a) login credentials used to authenticate to the Nebula login endpoint, - either from .netrc or specified using --username/--password - b) the --cookies supplied cookie jar - c) the NEBULA_TOKEN environment variable - d) the --video-password command line argument (this isn't documented in - the error message, because probably highly unpopular) - If none of these are successful, an end user-intended error message is - raised, listing some solutions. - - Returns a Nebula API token, which subsequently can be used to make - authenticated calls to the Nebula API. - """ - nebula_token = None - - # option #1: login credentials via .netrc or --username and --password - username, password = self._get_login_info() - if username and password: - self.to_screen('Authenticating to Nebula using .netrc or command line-supplied credentials') - nebula_token = self._perform_login(username, password, video_id) - - # option #2: nebula token via cookie jar - if not nebula_token: - # TODO: is there a helper to do all this cookie extraction? - nebula_cookies = self._get_cookies('https://watchnebula.com') - nebula_cookie = nebula_cookies.get('nebula-auth') - if nebula_cookie: - self.to_screen('Authenticating to Nebula with credentials from cookie jar') - nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) - nebula_token = self._parse_json(nebula_cookie_value, video_id).get('apiToken') - - # option #3: nebula token via environment variable - if not nebula_token and 'NEBULA_TOKEN' in os.environ: - nebula_token = os.environ.get('NEBULA_TOKEN') - if nebula_token: - self.to_screen('Authenticating to Nebula with token from NEBULA_TOKEN environment variable') - - # option #4: nebula token via --videopassword - if not nebula_token: - nebula_token = self._downloader.params.get('videopassword') - if nebula_token: - self.to_screen('Authenticating to Nebula with token from --videopassword') - - if not nebula_token: - raise ExtractorError('Nebula requires an account with an active subscription. ' - 'You can supply your authentication information by either ' - 'a) storing your credentials in .netrc or supplying them via --username and --password, or ' - 'b) passing in a cookie jar containing a nebula-auth cookie via --cookies, or ' - 'c) setting the environment variable NEBULA_TOKEN.') - return nebula_token - - def _retrieve_zype_api_key(self, page_url, display_id): - """ - Retrieves the Zype API key required to make calls to the Zype API. - - Unfortunately, the Nebula frontend stores this as a JS object literal in one of its JS chunks, - looking somewhat like this (but minified): - - return { - NODE_ENV: "production", - REACT_APP_NAME: "Nebula", - REACT_APP_NEBULA_API: "https://api.watchnebula.com/api/v1/", - REACT_APP_ZYPE_API: "https://api.zype.com/", - REACT_APP_ZYPE_API_KEY: "", - REACT_APP_ZYPE_APP_KEY: "", - // ... - } - - So we have to find the reference to the chunk in the video page (as it is hashed and the hash will - change when they do a new release), then download the chunk and extract the API key from there, - hoping they won't rename the constant. - - Alternatively, it is currently hardcoded and shared among all users. We haven't seen it - change so far, so we could also just hardcode it in the extractor as a fallback. 
- """ - # fetch the video page - webpage = self._download_webpage(page_url, video_id=display_id) - - # find the script tag with a file named 'main..chunk.js' in there - main_script_relpath = self._search_regex( - r']*src="(?P[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage, - group='script_relpath', name='script relative path', fatal=True) - - # fetch the JS chunk - main_script_abspath = urljoin(page_url, main_script_relpath) - main_script = self._download_webpage(main_script_abspath, video_id=display_id, - note='Retrieving Zype API key') - - # find the API key named 'REACT_APP_ZYPE_API_KEY' in there - api_key = self._search_regex( - r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P[\w-]*)"', main_script, - group='api_key', name='API key', fatal=True) - - return api_key - - def _call_zype_api(self, path, params, video_id, api_key, note): - """ - A helper for making calls to the Zype API. - """ - query = {'api_key': api_key, 'per_page': 1} - query.update(params) - return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note) - - def _fetch_zype_video_data(self, display_id, api_key): - """ - Fetch video meta data from the Zype API. - """ - response = self._call_zype_api('/videos', {'friendly_title': display_id}, - display_id, api_key, note='Retrieving metadata from Zype') - if 'response' not in response or len(response['response']) != 1: - raise ExtractorError('Unable to find video on Zype API') - return response['response'][0] - - def _call_nebula_api(self, path, video_id, access_token, note): - """ - A helper for making calls to the Nebula API. - """ - return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ - 'Authorization': 'Token {access_token}'.format(access_token=access_token) - }, note=note) - - def _fetch_zype_access_token(self, video_id, nebula_token): - """ - Requests a Zype access token from the Nebula API. - """ - user_object = self._call_nebula_api('/auth/user/', video_id, nebula_token, note='Retrieving Zype access token') - access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) - if not access_token: - if try_get(user_object, lambda x: x['is_subscribed'], bool): - raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint, please try loading an arbitrary video in a browser with this account to ''prime'' it for video downloading') - raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') - return access_token - - def _build_video_url(self, video_id, zype_access_token): - """ - Construct a Zype video URL (as supported by the Zype extractor), given a Zype video ID and a Zype access token. - """ - return 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format( - video_id=video_id, - access_token=zype_access_token) - - def _extract_channel(self, video_meta): - """ - Extract the channel title, by going through the list of categories and finding the first value of the - first category that has a value. - - I know this look like a terrible approach. 
But actually, it's just reproducing the behavior of the - React code the Nebula frontend uses (as of 2020-04-07): - - let channel; - if (video && video.categories && video.categories.length) { - const channelTitle = video.categories.map((category) => (category.value[0])) - .filter((title) => (!!title))[0]; - channel = getChannelByTitle(state, { title: channelTitle }); - } - - Basically, it finds the first (truthy) value in the category list and that's assumed to be the - channel title. And then the channel details (e.g. the URL) are looked up by title (!) (not by any - kind of ID) via an additional API call. - - TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL - - May return None of no category list could be found or no category had a label ('value'). - """ - categories = video_meta.get('categories', []) if video_meta else [] - for category in categories: - if category.get('value'): # we're intentionally not using "'value' in category" here, because the expression is supposed to be falsy for empty lists in category['value'] as well! - return category['value'][0] + def _real_extract(self, url): + slug = self._match_id(url) + url, smuggled_data = unsmuggle_url(url, {}) + if smuggled_data.get('id'): + return merge_dicts({ + 'id': smuggled_data['id'], + 'display_id': slug, + 'title': '', + }, self._extract_formats(smuggled_data['id'], slug)) + + metadata = self._call_api( + 'https://content.api.nebula.app/content/videos/{0}'.format(slug), + slug, note='Fetching video metadata') + return merge_dicts( + self._extract_video_metadata(metadata), + self._extract_formats(metadata['id'], slug), + rev=True + ) + + +class NebulaClassIE(NebulaBaseIE): + IE_NAME = 'nebula:media' + _VALID_URL = r'{0}/(?!(?:myshows|library|videos)/)(?P[\w-]+)/(?P[\w-]+)/?(?:$|[?#])'.format(_BASE_URL_RE) + _TESTS = [{ + 'url': 'https://nebula.tv/copyright-for-fun-and-profit/14', + 'info_dict': { + 'id': 'd7432cdc-c608-474d-942c-f74345daed7b', + 'ext': 'mp4', + 'display_id': '14', + 'channel_url': 'https://nebula.tv/copyright-for-fun-and-profit', + 'episode_number': 14, + 'thumbnail': r're:https?://images\.nebula\.tv/[a-f\d-]+$', + 'uploader_url': 'https://nebula.tv/copyright-for-fun-and-profit', + 'duration': 646, + 'episode': 'Episode 14', + 'title': 'Photos, Sculpture, and Video', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': 'm3u8', + }, + 'skip': 'Only available for registered users', + }, { + 'add_ies': [Art19IE], + 'url': 'https://nebula.tv/extremitiespodcast/pyramiden-the-high-arctic-soviet-ghost-town', + 'info_dict': { + 'ext': 'mp3', + 'id': '83ef3b53-049e-4211-b34e-7bb518e67d64', + 'description': r"re:(?s)20 years ago, what was previously the Soviet Union's .{467}#do-not-sell-my-info\.$", + 'series_id': 'e0223cfc-f39c-4ad4-8724-bd8731bd31b5', + 'modified_timestamp': 1629410982, + 'episode_id': '83ef3b53-049e-4211-b34e-7bb518e67d64', + 'series': 'Extremities', + # 'modified_date': '20200903', + 'upload_date': '20200902', + 'title': 'Pyramiden: The High-Arctic Soviet Ghost Town', + 'release_timestamp': 1571237958, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'duration': 1546.05714, + 'timestamp': 1599085555, + 'release_date': '20191016', + }, + }, { + 'url': 'https://nebula.tv/thelayover/the-layover-episode-1', + 'info_dict': { + 'ext': 'mp3', + 'id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0', + 'episode_number': 1, + 'thumbnail': r're:https?://images\.nebula\.tv/[a-f\d-]+$', + 
'release_date': '20230304', + 'modified_date': '20230403', + 'series': 'The Layover', + 'episode_id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0', + 'modified_timestamp': 1680554566, + 'duration': 3130.46401, + 'release_timestamp': 1677943800, + 'title': 'The Layover — Episode 1', + 'series_id': '874303a5-4900-4626-a4b6-2aacac34466a', + 'upload_date': '20230303', + 'episode': 'Episode 1', + 'timestamp': 1677883672, + 'description': 'md5:002cca89258e3bc7c268d5b8c24ba482', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': 'm3u8', + }, + 'skip': 'Only available for registered users', + }] + + def _real_extract(self, url): + slug, episode = self._match_valid_url(url).group('id', 'ep') + url, smuggled_data = unsmuggle_url(url, {}) + if smuggled_data.get('id'): + return merge_dicts({ + 'id': smuggled_data['id'], + 'display_id': slug, + 'title': '', + }, self._extract_formats(smuggled_data['id'], slug)) + + metadata = self._call_api( + 'https://content.api.nebula.app/content/{0}/{1}/?include=lessons'.format( + slug, episode), + slug, note='Fetching class/podcast metadata') + content_type = traverse_obj(metadata, 'type') + if content_type == 'lesson': + return merge_dicts( + self._extract_video_metadata(metadata), + self._extract_formats(metadata['id'], slug)) + elif content_type == 'podcast_episode': + episode_url = metadata.get('episode_url') + if not episode_url and metadata.get('premium'): + self.raise_login_required() + + if Art19IE.suitable(episode_url): + return self.url_result(episode_url, Art19IE.ie_key()) + return merge_dicts({ + 'id': metadata['id'], + 'title': metadata['title'], + }, traverse_obj(metadata, { + 'url': ('episode_url', T(url_or_none)), + 'description': ('description', T(str_or_none)), + 'timestamp': ('published_at', T(parse_iso8601)), + 'duration': ('duration', T(int_or_none)), + 'channel_id': ('channel_id', T(str_or_none)), + 'channel': ('channel_title', T(str_or_none)), + 'thumbnail': ('assets', 'regular', T(url_or_none)), + })) + + raise ExtractorError('Unexpected content type {0!r}'.format(content_type)) + + +class NebulaPlaylistBaseIE(NebulaBaseIE): + _BASE_API_URL = 'https://content.api.nebula.app/' + _API_QUERY = {'ordering': '-published_at'} + + @classmethod + def _get_api_url(cls, item_id, path='/video_episodes/'): + return update_url(cls._BASE_API_URL, path=path, query_update=cls._API_QUERY) + + @staticmethod + def _get_episode_url(episode, episode_id): + return 'https://nebula.tv/videos/{0}'.format(episode_id) + + @classmethod + def url_result(cls, url, *args, **kwargs): + url_transparent = kwargs.pop('url_transparent', False) + smuggled_data = kwargs.pop('smuggled_data', None) + if smuggled_data: + url = smuggle_url(url, smuggled_data) + ie_key = args[0] if len(args) > 0 else kwargs.get('ie_key') + if not ie_key: + args = (NebulaIE.ie_key(),) + args + return merge_dicts( + {'_type': 'url_transparent'} if url_transparent else {}, + super(NebulaPlaylistBaseIE, cls).url_result(url, *args), + **kwargs) + + def _generate_playlist_entries(self, pl_id=None, slug=None, dl_note=None): + next_url = self._get_api_url(pl_id) + if dl_note is None: + dl_note = self.IE_NAME.rpartition(':')[::2] + if dl_note[0] and dl_note[1]: + dl_note = '{0} '.format(dl_note[1]) + else: + dl_note = '' + slug = slug or pl_id + for page_num in itertools.count(1): + episodes = self._call_api( + next_url, slug, note='Retrieving {0}page {1}'.format( + dl_note, page_num)) + for episode in traverse_obj(episodes, ('results', Ellipsis)): + metadata = self._extract_video_metadata(episode) + 
yield self.url_result( + self._get_episode_url(episode, metadata['display_id']), + smuggled_data={'id': episode['id']}, url_transparent=True, + **metadata) + next_url = episodes.get('next') + if not next_url: + break + + +class NebulaSubscriptionsIE(NebulaPlaylistBaseIE): + IE_NAME = 'nebula:subscriptions' + _VALID_URL = r'{0}/myshows'.format(_BASE_URL_RE) + _API_QUERY = { + 'following': 'true', + 'include': 'engagement', + 'ordering': '-published_at', + } + _TESTS = [{ + 'url': 'https://nebula.tv/myshows', + 'playlist_mincount': 1, + 'info_dict': { + 'id': 'myshows', + }, + 'skip': 'You must be logged in to find your subscriptions', + }] + + def _call_api(self, *args, **kwargs): + + try: + return super(NebulaSubscriptionsIE, self)._call_api(*args, **kwargs) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: + self.raise_login_required('You must be logged in to find your subscriptions') + raise + + def _real_extract(self, url): + slug = url_basename(url) + return self.playlist_result(self._generate_playlist_entries(slug), slug) + + +class NebulaChannelIE(NebulaPlaylistBaseIE): + IE_NAME = 'nebula:channel' + _VALID_URL = r'{0}/(?!myshows|library|videos)(?P[\w-]+)/?(?:$|[?#])'.format(_BASE_URL_RE) + _TESTS = [{ + 'url': 'https://nebula.tv/tom-scott-presents-money', + 'info_dict': { + 'id': 'tom-scott-presents-money', + 'title': 'Tom Scott Presents: Money', + 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', + }, + 'playlist_count': 5, + }, { + 'url': 'https://nebula.tv/lindsayellis', + 'info_dict': { + 'id': 'lindsayellis', + 'title': 'Lindsay Ellis', + 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://nebula.tv/johnnyharris', + 'info_dict': { + 'id': 'johnnyharris', + 'title': 'Johnny Harris', + 'description': 'I make videos about maps and many other things.', + }, + 'playlist_mincount': 90, + }, { + 'url': 'https://nebula.tv/copyright-for-fun-and-profit', + 'info_dict': { + 'id': 'copyright-for-fun-and-profit', + 'title': 'Copyright for Fun and Profit', + 'description': 'md5:6690248223eed044a9f11cd5a24f9742', + }, + 'playlist_count': 23, + }, { + 'url': 'https://nebula.tv/trussissuespodcast', + 'info_dict': { + 'id': 'trussissuespodcast', + 'title': 'Bite the Ballot', + 'description': 'md5:a08c4483bc0b705881d3e0199e721385', + }, + 'playlist_mincount': 80, + }] + + @classmethod + def _get_api_url(cls, item_id, path='/video_channels/{0}/video_episodes/'): + return super(NebulaChannelIE, cls)._get_api_url( + item_id, path=path.format(item_id)) + + @classmethod + def _get_episode_url(cls, episode, episode_id): + return ( + episode.get('share_url') + or super(NebulaChannelIE, cls)._get_episode_url(episode, episode_id)) + + def _generate_class_entries(self, channel): + for lesson in traverse_obj(channel, ('lessons', Ellipsis)): + metadata = self._extract_video_metadata(lesson) + yield self.url_result( + lesson.get('share_url') or 'https://nebula.tv/{0}/{1}'.format( + metadata['class_slug'], metadata['slug']), + smuggled_data={'id': lesson['id']}, url_transparent=True, + **metadata) + + def _generate_podcast_entries(self, collection_id, collection_slug): + next_url = 'https://content.api.nebula.app/podcast_channels/{0}/podcast_episodes/?ordering=-published_at&premium=true'.format( + collection_id) + for page_num in itertools.count(1): + episodes = self._call_api(next_url, collection_slug, note='Retrieving podcast page 
{0}'.format(page_num)) + + for episode in traverse_obj(episodes, ('results', lambda _, v: url_or_none(v['share_url']))): + yield self.url_result(episode['share_url'], NebulaClassIE) + next_url = episodes.get('next') + if not next_url: + break def _real_extract(self, url): - # extract the video's display ID from the URL (we'll retrieve the video ID later) - display_id = self._match_id(url) - - # retrieve Nebula authentication information - nebula_token = self._retrieve_nebula_auth(display_id) - - # fetch video meta data from the Nebula API - api_key = self._retrieve_zype_api_key(url, display_id) - video_meta = self._fetch_zype_video_data(display_id, api_key) - video_id = video_meta['_id'] - - # extract additional info - channel_title = self._extract_channel(video_meta) - - # fetch the access token for Zype, then construct the video URL - zype_access_token = self._fetch_zype_access_token(display_id, nebula_token=nebula_token) - video_url = self._build_video_url(video_id, zype_access_token) - - return { - 'id': video_id, - 'display_id': display_id, - - # we're passing this video URL on to the 'Zype' extractor (that's the video infrastructure that Nebula is - # built on top of) and use the 'url_transparent' type to indicate that our meta data should be better than - # whatever the Zype extractor is able to identify - '_type': 'url_transparent', - 'ie_key': 'Zype', - 'url': video_url, - - # the meta data we were able to extract from Nebula - 'title': video_meta.get('title'), - 'description': video_meta.get('description'), - 'timestamp': parse_iso8601(video_meta.get('published_at')), - 'thumbnails': [ - { - 'id': tn.get('name'), # this appears to be null in all cases I've encountered - 'url': tn['url'], - 'width': tn.get('width'), - 'height': tn.get('height'), - } for tn in video_meta.get('thumbnails', [])], - 'duration': video_meta.get('duration'), - 'channel': channel_title, - 'uploader': channel_title, # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series - # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from! 
- # TODO: channel_id - # TODO: channel_url - } + collection_slug = self._match_id(url) + channel = self._call_api( + 'https://content.api.nebula.app/content/{0}/?include=lessons'.format( + collection_slug), + collection_slug, note='Retrieving channel') + + channel_type = traverse_obj(channel, 'type') + if channel_type == 'class': + entries = self._generate_class_entries(channel) + elif channel_type == 'podcast_channel': + entries = self._generate_podcast_entries(channel['id'], collection_slug) + else: + entries = self._generate_playlist_entries(channel['id'], collection_slug) + + return self.playlist_result( + entries, + playlist_id=collection_slug, + playlist_title=channel.get('title'), + playlist_description=channel.get('description')) From 88619125c8411da1fc280446a8a0ca8d7527599b Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 23 Nov 2024 10:39:54 +0000 Subject: [PATCH 15/19] Create art19.py --- youtube_dl/extractor/art19.py | 315 ++++++++++++++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 youtube_dl/extractor/art19.py diff --git a/youtube_dl/extractor/art19.py b/youtube_dl/extractor/art19.py new file mode 100644 index 00000000000..f47deadab09 --- /dev/null +++ b/youtube_dl/extractor/art19.py @@ -0,0 +1,315 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + merge_dicts, + parse_iso8601, + str_or_none, + T, + traverse_obj, + url_or_none, +) + + +class Art19IE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{12}' + _VALID_URL = ( + r'https?://(?:www\.)?art19\.com/shows/[^/#?]+/episodes/(?P{0})'.format(_UUID_REGEX), + r'https?://rss\.art19\.com/episodes/(?P{0})\.mp3'.format(_UUID_REGEX), + ) + _EMBED_REGEX = (r']+\bsrc\s*=\s*[\'"](?P{0})'.format(_VALID_URL[0]),) + + _TESTS = [{ + 'url': 'https://rss.art19.com/episodes/5ba1413c-48b8-472b-9cc3-cfd952340bdb.mp3', + 'info_dict': { + 'id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb', + 'ext': 'mp3', + 'title': 'Why Did DeSantis Drop Out?', + 'series': 'The Daily Briefing', + 'release_timestamp': 1705941275, + 'description': 'md5:da38961da4a3f7e419471365e3c6b49f', + 'episode': 'Episode 582', + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'series_id': 'ed52a0ab-08b1-4def-8afc-549e4d93296d', + 'upload_date': '20240122', + 'timestamp': 1705940815, + 'episode_number': 582, + # 'modified_date': '20240122', + 'episode_id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb', + 'modified_timestamp': int, + 'release_date': '20240122', + 'duration': 527.4, + }, + }, { + 'url': 'https://art19.com/shows/scamfluencers/episodes/8319b776-4153-4d22-8630-631f204a03dd', + 'info_dict': { + 'id': '8319b776-4153-4d22-8630-631f204a03dd', + 'ext': 'mp3', + 'title': 'Martha Stewart: The Homemaker Hustler Part 2', + # 'modified_date': '20240116', + 'upload_date': '20240105', + 'modified_timestamp': int, + 'episode_id': '8319b776-4153-4d22-8630-631f204a03dd', + 'series_id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75', + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'description': r're:(?s)In the summer of 2003, Martha Stewart is indicted .{695}#do-not-sell-my-info\.$', + 'release_timestamp': 1705305660, + 'release_date': '20240115', + 'timestamp': 1704481536, + 'episode_number': 88, + 'series': 'Scamfluencers', + 'duration': 2588.37501, + 'episode': 'Episode 88', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 
'https://www.nu.nl/formule-1/6291456/verstappen-wordt-een-synoniem-voor-formule-1.html', + 'info_dict': { + 'id': '7d42626a-7301-47db-bb8a-3b6f054d77d7', + 'ext': 'mp3', + 'title': "'Verstappen wordt een synoniem voor Formule 1'", + 'season': 'Seizoen 6', + 'description': 'md5:39a7159a31c4cda312b2e893bdd5c071', + 'episode_id': '7d42626a-7301-47db-bb8a-3b6f054d77d7', + 'duration': 3061.82111, + 'series_id': '93f4e113-2a60-4609-a564-755058fa40d8', + 'release_date': '20231126', + 'modified_timestamp': 1701156004, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'season_number': 6, + 'episode_number': 52, + # 'modified_date': '20231128', + 'upload_date': '20231126', + 'timestamp': 1701025981, + 'season_id': '36097c1e-7455-490d-a2fe-e2f10b4d5f26', + 'series': 'De Boordradio', + 'release_timestamp': 1701026308, + 'episode': 'Episode 52', + }, + }, { + 'url': 'https://www.wishtv.com/podcast-episode/larry-bucshon-announces-retirement-from-congress/', + 'info_dict': { + 'id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0', + 'ext': 'mp3', + 'title': 'Larry Bucshon announces retirement from congress', + 'upload_date': '20240115', + 'episode_number': 148, + 'episode': 'Episode 148', + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'release_date': '20240115', + 'timestamp': 1705328205, + 'release_timestamp': 1705329275, + 'series': 'All INdiana Politics', + # 'modified_date': '20240117', + 'modified_timestamp': 1705458901, + 'series_id': 'c4af6c27-b10f-4ff2-9f84-0f407df86ff1', + 'episode_id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0', + 'description': 'md5:53b5239e4d14973a87125c217c255b2a', + 'duration': 1256.18848, + }, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + for from_ in super(Art19IE, cls)._extract_embed_urls(url, webpage): + yield from_ + for episode_id in re.findall( + r']+\bclass\s*=\s*[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-episode-id=[\'"]({0})[\'"]'.format(cls._UUID_REGEX), webpage): + yield 'https://rss.art19.com/episodes/{0}.mp3'.format(episode_id) + + def _real_extract(self, url): + episode_id = self._match_id(url) + + player_metadata = self._download_json( + 'https://art19.com/episodes/{0}'.format(episode_id), episode_id, + note='Downloading player metadata', fatal=False, + headers={'Accept': 'application/vnd.art19.v0+json'}) + rss_metadata = self._download_json( + 'https://rss.art19.com/episodes/{0}.json'.format(episode_id), episode_id, + fatal=False, note='Downloading RSS metadata') + + formats = [{ + 'format_id': 'direct', + 'url': 'https://rss.art19.com/episodes/{0}.mp3'.format(episode_id), + 'vcodec': 'none', + 'acodec': 'mp3', + }] + for fmt_id, fmt_data in traverse_obj(rss_metadata, ( + 'content', 'media', T(dict.items), + lambda _, k_v: k_v[0] != 'waveform_bin' and k_v[1].get('url'))): + fmt_url = url_or_none(fmt_data['url']) + if not fmt_url: + continue + formats.append({ + 'format_id': fmt_id, + 'url': fmt_url, + 'vcodec': 'none', + 'acodec': fmt_id, + 'quality': -2 if fmt_id == 'ogg' else -1, + }) + + self._sort_formats(formats) + + return merge_dicts({ + 'id': episode_id, + 'formats': formats, + }, traverse_obj(player_metadata, ('episode', { + 'title': ('title', T(str_or_none)), + 'description': ('description_plain', T(str_or_none)), + 'episode_id': ('id', T(str_or_none)), + 'episode_number': ('episode_number', T(int_or_none)), + 'season_id': ('season_id', T(str_or_none)), + 'series_id': ('series_id', T(str_or_none)), + 'timestamp': ('created_at', T(parse_iso8601)), + 
'release_timestamp': ('released_at', T(parse_iso8601)), + 'modified_timestamp': ('updated_at', T(parse_iso8601)), + })), traverse_obj(rss_metadata, ('content', { + 'title': ('episode_title', T(str_or_none)), + 'description': ('episode_description_plain', T(str_or_none)), + 'episode_id': ('episode_id', T(str_or_none)), + 'episode_number': ('episode_number', T(int_or_none)), + 'season': ('season_title', T(str_or_none)), + 'season_id': ('season_id', T(str_or_none)), + 'season_number': ('season_number', T(int_or_none)), + 'series': ('series_title', T(str_or_none)), + 'series_id': ('series_id', T(str_or_none)), + 'thumbnail': ('cover_image', T(url_or_none)), + 'duration': ('duration', T(float_or_none)), + })), rev=True) + + +class Art19ShowIE(InfoExtractor): + IE_DESC = 'Art19 series' + _VALID_URL_BASE = r'https?://(?:www\.)?art19\.com/shows/(?P[\w-]+)(?:/embed)?/?' + _VALID_URL = ( + r'{0}(?:$|[#?])'.format(_VALID_URL_BASE), + r'https?://rss\.art19\.com/(?P[\w-]+)/?(?:$|[#?])', + ) + _EMBED_REGEX = (r']+\bsrc=[\'"](?P{0}[^\'"])'.format(_VALID_URL_BASE),) + + _TESTS = [{ + 'url': 'https://www.art19.com/shows/5898c087-a14f-48dc-b6fc-a2280a1ff6e0/', + 'info_dict': { + '_type': 'playlist', + 'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0', + 'display_id': 'echt-gebeurd', + 'title': 'Echt Gebeurd', + 'description': r're:(?us)Bij\sEcht Gebeurd\svertellen mensen .{1166} Eline Veldhuisen\.$', + 'timestamp': 1492642167, + # 'upload_date': '20170419', + 'modified_timestamp': int, + # 'modified_date': str, + 'tags': 'count:7', + }, + 'playlist_mincount': 425, + }, { + 'url': 'https://rss.art19.com/scamfluencers', + 'info_dict': { + '_type': 'playlist', + 'id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75', + 'display_id': 'scamfluencers', + 'title': 'Scamfluencers', + 'description': r're:(?s)You never really know someone\b.{1078} wondery\.com/links/scamfluencers/ now\.$', + 'timestamp': 1647368573, + # 'upload_date': '20220315', + 'modified_timestamp': int, + # 'modified_date': str, + 'tags': [], + }, + 'playlist_mincount': 90, + }, { + 'url': 'https://art19.com/shows/enthuellt/embed', + 'info_dict': { + '_type': 'playlist', + 'id': 'e2cacf57-bb8a-4263-aa81-719bcdd4f80c', + 'display_id': 'enthuellt', + 'title': 'Enthüllt', + 'description': 'md5:17752246643414a2fd51744fc9a1c08e', + 'timestamp': 1601645860, + # 'upload_date': '20201002', + 'modified_timestamp': int, + # 'modified_date': str, + 'tags': 'count:10', + }, + 'playlist_mincount': 10, + 'skip': 'Content not found', + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://deconstructingyourself.com/deconstructing-yourself-podcast', + 'info_dict': { + '_type': 'playlist', + 'id': 'cfbb9b01-c295-4adb-8726-adde7c03cf21', + 'display_id': 'deconstructing-yourself', + 'title': 'Deconstructing Yourself', + 'description': 'md5:dab5082b28b248a35476abf64768854d', + 'timestamp': 1570581181, + # 'upload_date': '20191009', + 'modified_timestamp': int, + # 'modified_date': str, + 'tags': 'count:5', + }, + 'playlist_mincount': 80, + }, { + 'url': 'https://chicagoreader.com/columns-opinion/podcasts/ben-joravsky-show-podcast-episodes/', + 'info_dict': { + '_type': 'playlist', + 'id': '9dfa2c37-ab87-4c13-8388-4897914313ec', + 'display_id': 'the-ben-joravsky-show', + 'title': 'The Ben Joravsky Show', + 'description': 'md5:c0f3ec0ee0dbea764390e521adc8780a', + 'timestamp': 1550875095, + # 'upload_date': '20190222', + 'modified_timestamp': int, + # 'modified_date': str, + 'tags': ['Chicago Politics', 'chicago', 'Ben Joravsky'], + }, + 'playlist_mincount': 1900, + }] + + @classmethod + def 
_extract_embed_urls(cls, url, webpage):
+        for from_ in super(Art19ShowIE, cls)._extract_embed_urls(url, webpage):
+            yield from_
+        for series_id in re.findall(
+                r'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-series-id=[\'"]([\w-]+)[\'"]', webpage):
+            yield 'https://art19.com/shows/{0}'.format(series_id)
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+        for expected in ((403, 404), None):
+            series_metadata, urlh = self._download_json_handle(
+                'https://art19.com/series/{0}'.format(series_id), series_id, note='Downloading series metadata',
+                headers={'Accept': 'application/vnd.art19.v0+json'},
+                expected_status=(403, 404))
+            if urlh.getcode() == 403:
+                # raise the actual problem with the page
+                urlh = self._request_webpage(url, series_id, expected_status=404)
+                if urlh.getcode() == 404:
+                    raise ExtractorError(
+                        'content not found, possibly expired',
+                        video_id=series_id, expected=True)
+            if urlh.getcode() not in (expected or []):
+                # apparently OK
+                break
+
+        return merge_dicts(
+            self.playlist_result((
+                self.url_result('https://rss.art19.com/episodes/{0}.mp3'.format(episode_id), Art19IE)
+                for episode_id in traverse_obj(series_metadata, ('series', 'episode_ids', Ellipsis, T(str_or_none))))),
+            traverse_obj(series_metadata, ('series', {
+                'id': ('id', T(str_or_none)),
+                'display_id': ('slug', T(str_or_none)),
+                'title': ('title', T(str_or_none)),
+                'description': ('description_plain', T(str_or_none)),
+                'timestamp': ('created_at', T(parse_iso8601)),
+                'modified_timestamp': ('updated_at', T(parse_iso8601)),
+            })),
+            traverse_obj(series_metadata, {
+                'tags': ('tags', Ellipsis, 'name', T(str_or_none)),
+            }, {'tags': T(lambda _: [])}))

From 79abdae7349578ca3a8ae16f83868801fc9d8563 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sat, 23 Nov 2024 10:47:21 +0000
Subject: [PATCH 16/19] Add Art19IE to extractors.py

And clean up sorting

---
 youtube_dl/extractor/extractors.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 7c7b880d7b9..e7341e0c054 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -71,13 +71,17 @@
     ARDIE,
     ARDMediathekIE,
 )
+from .art19 import (
+    Art19IE,
+    Art19ShowIE,
+)
+from .arnes import ArnesIE
 from .arte import (
     ArteTVIE,
     ArteTVEmbedIE,
     ArteTVPlaylistIE,
     ArteTVCategoryIE,
 )
-from .arnes import ArnesIE
 from .asiancrush import (
     AsianCrushIE,
     AsianCrushPlaylistIE,
@@ -783,7 +787,6 @@
     NebulaSubscriptionsIE,
 )
 from .nerdcubed import NerdCubedFeedIE
-from .netzkino import NetzkinoIE
 from .neteasemusic import (
     NetEaseMusicIE,
     NetEaseMusicAlbumIE,
@@ -793,6 +796,7 @@
     NetEaseMusicProgramIE,
     NetEaseMusicDjRadioIE,
 )
+from .netzkino import NetzkinoIE
 from .newgrounds import (
     NewgroundsIE,
     NewgroundsPlaylistIE,

From bd4729a866ad93cf95d7917ba02d05c8dc49269f Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sat, 23 Nov 2024 11:00:00 +0000
Subject: [PATCH 17/19] [utils] Add json_stringify()

* somewhat like JSON.stringify()
* replaces json.dumps(..., separators=(',',':')).encode('utf-8')
* more kwarg options available

---
 youtube_dl/utils.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index ac1e78002b3..90bb40bffaa 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -6715,3 +6715,45 @@ def sanitize_extension(cls, extension, **kwargs):
             raise cls(extension)
 
         return extension
+
+
+def json_stringify(json_expr, **kwargs):
+    # /, *, concise=True, result_encoding='utf-8', **{**encode_result_kwargs, **dumps_kwargs}
+    """
+    Convert json_expr to a string, suitable for passing over a network
+
+    @param json_expr            Python representation of a JSON expression
+
+    KW-only parameters
+    @param {bool} concise       do not space around , and : (default True)
+    @param {str} result_encoding
+                                encoding, if any, of the result (default 'utf-8')
+    @param {str} errors         error handling for result_encoding
+    @param ...                  other KW arguments passed to json.dumps()
+    @returns {bytes|str}        stringified JSON, encoded to bytes using
+                                result_encoding, or Unicode if none
+
+    With the default arguments, the return value is a byte string
+    suitable to be passed as POST data.
+
+    Inspired by JSON.stringify [1], but not so much as to emulate its optional
+    replacer (use cls=replacer_JSON_encoder) or space (use indent=space for space > 0).
+    1. https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify
+    """
+
+    # extract all non-dumps_kwargs
+    concise = kwargs.pop('concise', True)
+    result_encoding = kwargs.pop('result_encoding', 'utf-8')
+    result_errors = kwargs.pop('errors', None)
+
+    if concise:
+        kwargs['separators'] = (',', ':')
+    kwargs = compat_kwargs(kwargs)
+    result = json.dumps(json_expr, **kwargs)
+
+    if result_encoding:
+        kwargs = compat_kwargs({'errors': result_errors}) if result_errors else {}
+        result = result.encode(result_encoding, **kwargs)
+        return result
+
+    # return a Unicode value of type type('')
+    return '' + result

From 92d881c33fbafdb4875b7424ad789f116e57137d Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sat, 23 Nov 2024 11:03:37 +0000
Subject: [PATCH 18/19] Linty

---
 youtube_dl/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 90bb40bffaa..3b2ec397cd4 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -6716,6 +6716,7 @@ def sanitize_extension(cls, extension, **kwargs):
         return extension
 
+
 def json_stringify(json_expr, **kwargs):
     # /, *, concise=True, result_encoding='utf-8', **{**encode_result_kwargs, **dumps_kwargs}
     """
     Convert json_expr to a string, suitable for passing over a network

From d4664a53467eade879971de3950424299f68bd3a Mon Sep 17 00:00:00 2001
From: dirkf
Date: Sat, 23 Nov 2024 11:14:30 +0000
Subject: [PATCH 19/19] Remove (last?) set literal

---
 youtube_dl/extractor/nebula.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 3e55db9391d..7778eab70dd 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -136,7 +136,8 @@ def _extract_video_metadata(self, episode):
             'series': 'channel_title',
             'creator': 'channel_title',
             'thumbnail': ('images', 'thumbnail', 'src', T(url_or_none)),
-            'episode_number': ('order', {int_or_none}),
+            'episode_number': ('order', T(int_or_none)),
+
             # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE
             # '_old_archive_ids': ('zype_id', {lambda x: [
             #     make_archive_id(NebulaIE, x), make_archive_id(NebulaSubscriptionsIE, x)] if x else None}),
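
A minimal usage sketch of the new json_stringify() helper (not part of the patches above): it mirrors the login call in NebulaBaseIE._perform_login(), with placeholder values, and the commented json.dumps() line is the pattern the helper replaces:

    from youtube_dl.utils import json_stringify

    # compact, UTF-8 encoded bytes, suitable as POST data
    data = json_stringify({'email': 'user@example.com', 'password': 'secret'})
    # equivalent to:
    # json.dumps({'email': 'user@example.com', 'password': 'secret'},
    #            separators=(',', ':')).encode('utf-8')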