diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6e8fc3961b7..34c374a0fcf 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1249,6 +1249,7 @@ from .tiktok import ( TikTokIE, TikTokUserIE, + TikTokVMIE, ) from .tinypic import TinyPicIE from .tmz import ( diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 99a0c07d7e0..49df0844b85 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -1,15 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .generic import GenericIE from ..compat import ( - compat_str, - compat_urllib_request, + compat_kwargs, ) from ..utils import ( + dict_get, ExtractorError, float_or_none, - HEADRequest, + get_element_by_id, int_or_none, str_or_none, try_get, @@ -17,24 +20,37 @@ ) +# decorator enforces UA that TT doesn't block +def vanilla_UA_request(func): + + vanilla_UA = 'Mozilla/5.0' + + def wrapped(*args, **kwargs): + headers = kwargs.get('headers', {}) + if 'User-Agent' not in headers: + headers['User-Agent'] = vanilla_UA + kwargs.update({'headers': headers, }) + kwargs = compat_kwargs(kwargs) + return func(*args, **kwargs) + + return wrapped + + class TikTokBaseIE(InfoExtractor): - def _download_webpage( - self, url_or_request, video_id, note=None, errnote=None, - fatal=True, tries=1, timeout=5, encoding=None, data=None, - headers={}, query={}, expected_status=None, setup=True): - - if setup: - url = url_or_request.geturl() if isinstance(url_or_request, compat_urllib_request.Request) else url_or_request - # dummy request to set cookies - self._request_webpage( - HEADRequest(url), video_id, - note=False, errnote='Could not send HEAD request to %s' % url, - fatal=False, headers=headers) - - return super(TikTokBaseIE, self)._download_webpage( - url_or_request, video_id, note=note, errnote=errnote, - fatal=fatal, tries=tries, timeout=timeout, encoding=encoding, data=data, - headers=headers, query=query, expected_status=expected_status) + IE_DESC = 'Abstract base for TikTok extractors' + IE_NAME = 'tiktok:base' + + @vanilla_UA_request + def _request_webpage(self, *args, **kwargs): + return super(TikTokBaseIE, self)._request_webpage(*args, **kwargs) + + def _get_SIGI_STATE(self, video_id, html): + state = self._parse_json( + get_element_by_id('SIGI_STATE', html) + or self._search_regex( + r'''(?s)]*?\bid\s*=\s*(?P"|'|\b)sigi-persisted-data(?P=q)[^>]*>[^=]*=\s*(?P{.+?})\s*(?:;[^<]+)?\d+)' + IE_DESC = 'TikTok video extractor' + IE_NAME = 'tiktok' + _VALID_URL = r'(?:https?://(?:(?:www|m)\.)?tiktok\.com/@[^/]+/video/|tiktok:(?P[^/?#&]+):)(?P\d+)' _TESTS = [{ 'url': 'https://www.tiktok.com/@zureeal/video/6606727368545406213', 'md5': '163ceff303bb52de60e6887fe399e6cd', 'info_dict': { 'id': '6606727368545406213', 'ext': 'mp4', - 'title': 'md5:24acc456b62b938a7e2dd88e978b20d9', + 'title': 'md5:363e08ccb6c691314710429f379bffe5', 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', 'thumbnail': r're:^https?://.*', 'duration': 15, - 'uploader': 'md5:24acc456b62b938a7e2dd88e978b20d9', + 'uploader': 'md5:363e08ccb6c691314710429f379bffe5', 'uploader_id': '188294915489964032', 'timestamp': 1538248586, 'upload_date': '20180929', @@ -124,78 +142,85 @@ class TikTokIE(TikTokBaseIE): } }] + def _real_initialize(self): + # Setup session (will set necessary cookies) + self._request_webpage( + 'https://www.tiktok.com/', None, note='Setting up session') + def _real_extract(self, url): - video_id = self._match_id(url) + m = re.match(self._VALID_URL, url).groupdict() + video_id = m['id'] + if 'user_id' in m: + url = 'https://www.tiktok.com/@%(user_id)s/video/%(id)s/' % m - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url.replace('://m.', '://www.'), video_id) - page_props = self._parse_json(self._search_regex( - r'''(?s)]*?\bid\s*=\s*(?P"|'|\b)sigi-persisted-data(?P=q)[^>]*>[^=]*=\s*(?P{.+?})\s*(?:;[^<]+)?]+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*[^/?#&]+)' +class TikTokUserIE(TikTokIE): + IE_DESC = 'TikTok user profile extractor' + IE_NAME = 'tiktok:user' + _VALID_URL = r'https?://(?:(?:www|m)\.)?tiktok\.com/@(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.tiktok.com/@zureeal', 'info_dict': { 'id': '188294915489964032', }, - 'playlist_mincount': 24, + 'playlist_mincount': 30, + 'expected_warnings': [ + 'More videos are available', + ], }] @classmethod def suitable(cls, url): - return False if TikTokIE.suitable(url) else super(TikTokUserIE, cls).suitable(url) + return False if TikTokIE.suitable(url) else super(TikTokBaseIE, cls).suitable(url) def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage(url, user_id) - page_props = self._parse_json(self._search_regex( - r'''(?s)]*?\bid\s*=\s*(?P"|'|\b)sigi-persisted-data(?P=q)[^>]*>[^=]*=\s*(?P{.+?})\s*(?:;[^<]+)?[^/?#&.]+)' + _TESTS = [{ + 'url': 'https://vm.tiktok.com/ZMLesneqK/', + 'info_dict': { + 'id': '7054218882072055046', + 'ext': 'mp4', + 'title': 'EddY', + 'upload_date': '20220117', + 'description': 'Hilft bestimmt gegen nervige Anrufer! 😂 #telefon #call #prank #fail #sprecher #stimme #voice #band #ansage #sound #comedy #unterhaltung #scammer #fy', + 'timestamp': 1642438324, + 'uploader': 'EddY', + 'uploader_id': '6850021004246467590', + }, + }] + + @vanilla_UA_request + def _request_webpage(self, *args, **kwargs): + return super(GenericIE, self)._request_webpage(*args, **kwargs)