[extractor/sbs] Overhaul extractor for new APIs #31880

Open: wants to merge 7 commits into master (showing changes from 2 commits)
246 changes: 235 additions & 11 deletions youtube_dl/extractor/sbs.py
@@ -1,33 +1,66 @@
# coding: utf-8
from __future__ import unicode_literals

import sys

from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_kwargs,
compat_str,
)
from ..utils import (
smuggle_url,
error_to_compat_str,
ExtractorError,
float_or_none,
GeoRestrictedError,
int_or_none,
parse_duration,
parse_iso8601,
smuggle_url,
traverse_obj,
update_url_query,
url_or_none,
variadic,
)


class SBSIE(InfoExtractor):
IE_DESC = 'sbs.com.au'
_VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P<id>[0-9]+)'
_VALID_URL = r'''(?x)
    https?://(?:www\.)?sbs\.com\.au/(?:
        ondemand(?:
            /video/(?:single/)?|
            /movie/[^/]+/|
            /(?:tv|news)-series/(?:[^/]+/){3}|
            .*?\bplay=|/watch/
        )|news/(?:embeds/)?video/
    )(?P<id>[0-9]+)'''
_EMBED_REGEX = [r'''(?x)
    (?:
        <meta\s+property="og:video"\s+content=|
        <iframe[^>]+?src=
    )
    (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''']

Review comment (@vidiot720, Apr 16, 2023) on the /movie/[^/]+/| alternative of the new _VALID_URL:
Adjustment needed to match single-video programs (e.g. documentaries such as https://www.sbs.com.au/ondemand/tv-program/autun-romes-forgotten-sister/2116212803602), like so:

Suggested change:
/movie/[^/]+/|
/(?:tv-program|movie)/[^/]+/|

_TESTS = [{
# Original URL is handled by the generic IE which finds the iframe:
# Exceptional unrestricted show for testing, thanks SBS,
# from an iframe of this page, handled by the generic IE, now 404:
# http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation
'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed',
'md5': '3150cf278965eeabb5b4cea1c963fe0a',
'md5': 'e49d0290cb4f40d893b8dfe760dce6b0',
'info_dict': {
'id': '_rFBPRPO4pMR',
'id': '320403011771', # formerly '_rFBPRPO4pMR', no longer found
'ext': 'mp4',
'title': 'Dingo Conservation (The Feed)',
'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'duration': 308,
'timestamp': 1408613220,
'upload_date': '20140821',
'uploader': 'SBSC',
},
'expected_warnings': ['Unable to download JSON metadata'],
}, {
'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
'only_matching': True,
@@ -46,9 +79,197 @@ class SBSIE(InfoExtractor):
}, {
'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971',
'only_matching': True,
}, {
'url': 'https://www.sbs.com.au/ondemand/movie/coherence/1469404227931',
'only_matching': True,
}, {
'note': 'Live stream',
'url': 'https://www.sbs.com.au/ondemand/video/1726824003663/sbs-24x7-live-stream-nsw',
'only_matching': True,
}, {
'url': 'https://www.sbs.com.au/ondemand/news-series/dateline/dateline-2022/dateline-s2022-ep26/2072245827515',
'only_matching': True,
}, {
'url': 'https://www.sbs.com.au/ondemand/tv-series/the-handmaids-tale/season-5/the-handmaids-tale-s5-ep1/2065631811776',
'only_matching': True,
}]

def __handle_request_webpage_error(self, err, video_id=None, errnote=None, fatal=True):
if errnote is False:
return False
if errnote is None:
errnote = 'Unable to download webpage'

errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
if fatal:
raise ExtractorError(errmsg, sys.exc_info()[2], cause=err, video_id=video_id)
else:
self._downloader.report_warning(errmsg)
return False

def _download_webpage_handle(self, url, video_id, *args, **kwargs):
# note, errnote, fatal, encoding, data, headers, query, expected_status
# specialised to detect geo-block

errnote = args[2] if len(args) > 2 else kwargs.get('errnote')
fatal = args[3] if len(args) > 3 else kwargs.get('fatal')
exp = args[7] if len(args) > 7 else kwargs.get('expected_status')

# add 403 to expected codes for interception
exp = variadic(exp or [], allowed_types=(compat_str, ))
if 403 not in exp and '403' not in exp:
exp = list(exp)
exp.append(403)
else:
exp = None

if exp:
if len(args) > 7:
args = list(args)
args[7] = exp
else:
kwargs['expected_status'] = exp
kwargs = compat_kwargs(kwargs)

Review comment (@dirkf, Contributor Author, Mar 30, 2023):
Address #31880 (comment) (not required for yt-dlp or future yt-dl).

Suggested change:
# for now, ensure that headers passed is not None
if len(args) > 5:
if args[5] is None:
# replace with empty dict
args = list(args)
args[5] = {}
elif kwargs.get('headers', False) is None:
# enforce callee default
del kwargs['headers']

ret = super(SBSIE, self)._download_webpage_handle(url, video_id, *args, **kwargs)
if ret is False:
return ret
webpage, urlh = ret

if urlh.getcode() == 403:
if urlh.headers.get('x-error-reason') == 'geo-blocked':
countries = ['AU']
if fatal:
self.raise_geo_restricted(countries=countries)
err = GeoRestrictedError(
'This Australian content is not available from your location due to geo restriction',
countries=countries)
else:
err = compat_HTTPError(urlh.geturl(), 403, 'HTTP Error 403: Forbidden', urlh.headers, urlh)
ret = self.__handle_request_webpage_error(err, video_id, errnote, fatal)
if exp:
# caller doesn't expect 403
return False

return ret

def _extract_m3u8_formats(self, m3u8_url, video_id, *args, **kwargs):
# ext, entry_protocol, preference, m3u8_id, note, errnote, fatal,
# live, data, headers, query
entry_protocol = args[1] if len(args) > 1 else kwargs.get('entry_protocol')
if not entry_protocol:
entry_protocol = 'm3u8_native'
if len(args) > 1:
args = list(args)
args[1] = entry_protocol
else:
kwargs['entry_protocol'] = entry_protocol
kwargs = compat_kwargs(kwargs)

return super(SBSIE, self)._extract_m3u8_formats(m3u8_url, video_id, *args, **kwargs)

AUS_TV_PARENTAL_GUIDELINES = {
'P': 0,
'C': 7,
'G': 0,
'PG': 0,
'M': 15,
'MA15+': 15,
'AV15+': 15,
'R18+': 18,
'NC': 0, # not classified (unofficial, used by SBS)
}
_PLAYER_API = 'https://www.sbs.com.au/api/v3'
_CATALOGUE_API = 'https://catalogue.pr.sbsod.com/'

def _call_api(self, video_id, path, query=None, data=None, headers=None, fatal=True):
return self._download_json(update_url_query(
self._CATALOGUE_API + path, query),
video_id, headers=headers, fatal=fatal) or {}

def _get_smil_url(self, video_id):
return update_url_query(
self._PLAYER_API + '/video_smil', {'id': video_id})

def _get_player_data(self, video_id, headers=None, fatal=False):
return self._download_json(update_url_query(
self._PLAYER_API + '/video_stream', {'id': video_id, 'context': 'tv'}),
video_id, headers=headers, fatal=fatal) or {}

Review comment:
I believe headers should default to an empty dict here and in _call_api. Setting headers to None results in options like --geo-bypass-country breaking.

Reply (@dirkf, Contributor Author, Mar 30, 2023):
You may have a point, ATM. In extractor/common.py, ll. 619 ff.:

--- git master
+++ WIP code pulling stuff from yt-dlp

         if self._x_forwarded_for_ip:
-            if 'X-Forwarded-For' not in headers:
-                headers['X-Forwarded-For'] = self._x_forwarded_for_ip
+            headers = (headers or {}).copy()
+            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

The new style is to avoid accidentally sharing mutable default values by passing None instead of {}. So we should for now have headers or {}. As _download_webpage_handle() is being specialised and is on the call path for all the extractor web requests, that would be a good place to tweak it.
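For illustration, a minimal sketch of the reporter's suggestion applied to _call_api (an assumption for discussion, not the change adopted above):

    def _call_api(self, video_id, path, query=None, data=None, headers=None, fatal=True):
        # "headers or {}" keeps the shared default immutable (None) while still
        # handing common.py a dict it can add X-Forwarded-For to for geo-bypass.
        return self._download_json(
            update_url_query(self._CATALOGUE_API + path, query),
            video_id, headers=headers or {}, fatal=fatal) or {}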


def _real_extract(self, url):
video_id = self._match_id(url)
# get media links directly though later metadata may contain contentUrl
smil_url = self._get_smil_url(video_id)
formats = self._extract_smil_formats(smil_url, video_id, fatal=False) or []
self._sort_formats(formats)

# try for metadata from the same source
player_data = self._get_player_data(video_id, fatal=False)
media = traverse_obj(player_data, 'video_object', expected_type=dict) or {}
# get, or add, metadata from catalogue
media.update(self._call_api(video_id, 'mpx-media/' + video_id, fatal=not media))

# utils candidate for use with traverse_obj()
def txt_or_none(s):
return (s.strip() or None) if isinstance(s, compat_str) else None

# expected_type fn for thumbs
def xlate_thumb(t):
u = url_or_none(t.get('contentUrl'))
return u and {
'id': t.get('name'),
'url': u,
'width': int_or_none(t.get('width')),
'height': int_or_none(t.get('height')),
}

# may be numeric or timecoded
def really_parse_duration(d):
result = float_or_none(d)
if result is None:
result = parse_duration(d)
return result

def traverse_media(*args, **kwargs):
if 'expected_type' not in kwargs:
kwargs['expected_type'] = txt_or_none
kwargs = compat_kwargs(kwargs)
return traverse_obj(media, *args, **kwargs)

return {
'id': video_id,
'title': traverse_media(('displayTitles', Ellipsis, 'title'),
get_all=False) or media['title'],
Review comment (@vidiot720, Mar 19, 2023):
Thanks for these improvements to the metadata retrieval. One small issue: this change to the title setting loses the episode title for TV episodes and only sets the series name. The old version used 'name' or similar, which could be parsed with title:(?P<series>.+?(?= S[0-9]+)) S(?P<season_number>[0-9]+) Ep(?P<episode_number>[0-9]+)(?P<episode>.*). As 'episode' is not set directly, it gets the default Episode <episode number> instead.
The easy fix is to go back to using media['name'] to set the title, since the Series Title is set correctly anyway, but it's more complicated to pick up the episode title, as that appears as 'subtitle' within the 'displayTitles' node. But for movies, SBS uses the subtitle field for the genre (e.g. "Horror"), so that would be incorrect as a general solution, unless the episode setting can be guarded by e.g. externalRelations.sbsondemand.entity_type == 'EPISODE', or by testing displayTitles.components.SeasonEpisodeFull == ''.
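As a standalone illustration of that parse (the sample title is taken from the Blackport example later in this thread):

import re

# Fields the old-style title string can be split into, e.g. 'Blackport S1 Ep8 - Iceland'.
pattern = (r'(?P<series>.+?(?= S[0-9]+)) S(?P<season_number>[0-9]+)'
           r' Ep(?P<episode_number>[0-9]+)(?P<episode>.*)')
m = re.match(pattern, 'Blackport S1 Ep8 - Iceland')
if m:
    print(m.group('series'), m.group('season_number'),
          m.group('episode_number'), m.group('episode').lstrip(' -'))
    # -> Blackport 1 8 Iceland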

Reply (@dirkf, Contributor Author):

The extensive test suite (2 shows!) that I exercised included 1 show that was an episode, which had no specific episode name (Episode 6, or some such).

If the title or episode should be set to something like {series} S{season_number}E{episode_number} with any non-trivial episode value appended, that's fine. Show some examples.

Reply (@vidiot720, Mar 20, 2023):

No worries, that explains it. My suggestions:
For title, use media['name'], because:

  • this is backwards compatible with the prior behaviour;
  • since title is used for the default output template, just setting to the series name isn't particularly helpful when dealing with multiple episodes of the same series.

If this is done, title will take the form {series} S{season_number} Ep{episode number} - {episode} if the episode is named, or {series} S{season_number} Ep{episode number} if not. I've tested this change successfully, i.e. 'title': media['name'],. Since SBS builds this string already, there's no assembly required. For non-episodic programs it also works fine (e.g. for a film, it's just the title of the film).

For episode, assuming this doesn't hold up landing this PR too much (since it's pretty urgent fix for breakage), use player_data['name'], (i.e. the name element of the VideoObject), iff partOfSeries node exists; sorry, not sure how this would be tested for so I haven't got proposed code for this. I think this is a reasonable way to determine whether the video is part of a series, and could therefore have an episode specific title. I wouldn't test for partOfSeason as some episodic programs have none (e.g. mini-series, or current affairs). I've suggested this element because unlike the videoDescriptions, it's blank when it should be (for episodes), per the examples below.

Examples:

Video id 2170811459503: title should be: From Dusk Till Dawn and episode empty.
Video id 2158330947797: title should be: Blackport S1 Ep8 - Iceland and episode should be Iceland.
Video id 2165730371885: title should be: Devils S1 Ep1 and episode empty.

For this last example, there may be a preference to set episode to Episode 1 rather than empty, as I think this is again more consistent with previous default behaviour. However, this is redundant given that the episode number is now being properly set, where provided, so I'd argue the episode field should be reserved for actual episode titles and not a fabricated generic string.

Follow-up comment:

In re-jigging this PR for yt-dlp, discovered that the episode title is in the metadata from the catalogue API as title, and this has been added in the yt-dlp fix to the media object for later return as epName. Notwithstanding the traverse_obj differences, I expect this could be adapted for this PR:

        # For named episodes, use the catalogue's title to set episode, rather than generic 'Episode N'.
        if traverse_obj(media, ('partOfSeries', {dict})):
            media['epName'] = traverse_obj(media, ('title', {str}))

Follow-up suggestion:

Based on discussion above, update for title:

Suggested change
'title': traverse_media(('displayTitles', Ellipsis, 'title'),
get_all=False) or media['title'],
'title': media['name'],
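Putting the two suggestions together, a rough sketch of how they might slot into the new _real_extract (an illustration of the discussion above, not necessarily what gets merged):

    # For named episodes, promote the catalogue's 'title' to an episode name;
    # the info-dict title comes straight from the pre-built media['name'].
    if isinstance(media.get('partOfSeries'), dict):
        media['epName'] = media.get('title')
    return {
        'id': video_id,
        'title': media['name'],                       # e.g. 'Blackport S1 Ep8 - Iceland'
        'episode': txt_or_none(media.get('epName')),  # e.g. 'Iceland'; None for unnamed episodes
        # ... remaining fields as in the diff that follows ...
    }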

'formats': formats,
'description': traverse_media('description'),
'categories': traverse_media(
('genres', Ellipsis), ('taxonomy', ('genre', 'subgenre'), 'name')),
'tags': traverse_media(
(('consumerAdviceTexts', ('sbsSubCertification', 'consumerAdvice')), Ellipsis)),
'age_limit': self.AUS_TV_PARENTAL_GUIDELINES.get(traverse_media(
'classificationID', 'contentRating', default='').upper()),
'thumbnails': traverse_media(('thumbnails', Ellipsis),
expected_type=xlate_thumb),
'duration': traverse_media('duration',
expected_type=really_parse_duration),
'series': traverse_media(('partOfSeries', 'name'), 'seriesTitle'),
'series_id': traverse_media(('partOfSeries', 'uuid'), 'seriesID'),
'season_number': traverse_media(
(('partOfSeries', None), 'seasonNumber'),
expected_type=int_or_none, get_all=False),
'episode_number': traverse_media('episodeNumber',
expected_type=int_or_none),
'release_year': traverse_media('releaseYear',
expected_type=int_or_none),
'timestamp': traverse_media(
'datePublished', ('publication', 'startDate'),
expected_type=parse_iso8601),
'channel': traverse_media(('taxonomy', 'channel', 'name')),
'uploader': 'SBSC',
}

# just come behind the shed with me, mate
def _old_real_extract(self, url):

Review comment, with a suggested change:
def _old_real_extract(self, url):
def _old_real_extract(self, url): //disabled

Reply (@dirkf, Contributor Author):

It'll be removed before merging.

video_id = self._match_id(url)
player_params = self._download_json(
'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id)
@@ -66,13 +287,16 @@ def _real_extract(self, url):
error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '')
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)

urls = player_params['releaseUrls']
theplatform_url = (urls.get('progressive') or urls.get('html')
or urls.get('standard') or player_params['relatedItemsURL'])
media_url = traverse_obj(
player_params, ('releaseUrls', ('progressive', 'html', 'standard', 'htmlandroid')),
expected_type=url_or_none)
if not media_url:
raise ExtractorError('No', expected=True)

return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
# 'ie_key': 'ThePlatform',
'id': video_id,
'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}),
'url': smuggle_url(self._proto_relative_url(media_url), {'force_smil_url': True}),
'is_live': player_params.get('streamType') == 'live',
}