diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 6e8fc3961b7..34c374a0fcf 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1249,6 +1249,7 @@
from .tiktok import (
TikTokIE,
TikTokUserIE,
+ TikTokVMIE,
)
from .tinypic import TinyPicIE
from .tmz import (
diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py
index 99a0c07d7e0..49df0844b85 100644
--- a/youtube_dl/extractor/tiktok.py
+++ b/youtube_dl/extractor/tiktok.py
@@ -1,15 +1,18 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
+from .generic import GenericIE
from ..compat import (
- compat_str,
- compat_urllib_request,
+ compat_kwargs,
)
from ..utils import (
+ dict_get,
ExtractorError,
float_or_none,
- HEADRequest,
+ get_element_by_id,
int_or_none,
str_or_none,
try_get,
@@ -17,24 +20,37 @@
)
+# Decorator that enforces a User-Agent header that TikTok doesn't block
+def vanilla_UA_request(func):
+    """Wrap a request function so that it always sends a User-Agent header.
+
+    If the wrapped call has no 'User-Agent' entry in its `headers` keyword
+    argument, the plain value 'Mozilla/5.0' is added.  A User-Agent supplied
+    by the caller is left untouched.  Only the `headers` *keyword* argument
+    is inspected; positional arguments are passed through unchanged.
+    """
+    import functools
+
+    vanilla_UA = 'Mozilla/5.0'
+
+    @functools.wraps(func)
+    def wrapped(*args, **kwargs):
+        # Work on a copy so the caller's dict is never mutated, and treat
+        # an explicit `headers=None` like a missing argument
+        headers = dict(kwargs.get('headers') or {})
+        if 'User-Agent' not in headers:
+            headers['User-Agent'] = vanilla_UA
+        kwargs['headers'] = headers
+        # pass the keyword dict through the project's compat helper
+        kwargs = compat_kwargs(kwargs)
+        return func(*args, **kwargs)
+
+    return wrapped
+
+
class TikTokBaseIE(InfoExtractor):
- def _download_webpage(
- self, url_or_request, video_id, note=None, errnote=None,
- fatal=True, tries=1, timeout=5, encoding=None, data=None,
- headers={}, query={}, expected_status=None, setup=True):
-
- if setup:
- url = url_or_request.geturl() if isinstance(url_or_request, compat_urllib_request.Request) else url_or_request
- # dummy request to set cookies
- self._request_webpage(
- HEADRequest(url), video_id,
- note=False, errnote='Could not send HEAD request to %s' % url,
- fatal=False, headers=headers)
-
- return super(TikTokBaseIE, self)._download_webpage(
- url_or_request, video_id, note=note, errnote=errnote,
- fatal=fatal, tries=tries, timeout=timeout, encoding=encoding, data=data,
- headers=headers, query=query, expected_status=expected_status)
+ IE_DESC = 'Abstract base for TikTok extractors'
+ IE_NAME = 'tiktok:base'
+
+ # Every request made through this extractor goes through the decorator,
+ # which adds a 'User-Agent' header when the caller did not provide one;
+ # everything else is delegated unchanged to the parent implementation.
+ @vanilla_UA_request
+ def _request_webpage(self, *args, **kwargs):
+ return super(TikTokBaseIE, self)._request_webpage(*args, **kwargs)
+
+ def _get_SIGI_STATE(self, video_id, html):
+ state = self._parse_json(
+ get_element_by_id('SIGI_STATE', html)
+ or self._search_regex(
+ r'''(?s)\d+)'
+ IE_DESC = 'TikTok video extractor'
+ IE_NAME = 'tiktok'
+ _VALID_URL = r'(?:https?://(?:(?:www|m)\.)?tiktok\.com/@[^/]+/video/|tiktok:(?P[^/?#&]+):)(?P\d+)'
_TESTS = [{
'url': 'https://www.tiktok.com/@zureeal/video/6606727368545406213',
'md5': '163ceff303bb52de60e6887fe399e6cd',
'info_dict': {
'id': '6606727368545406213',
'ext': 'mp4',
- 'title': 'md5:24acc456b62b938a7e2dd88e978b20d9',
+ 'title': 'md5:363e08ccb6c691314710429f379bffe5',
'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay',
'thumbnail': r're:^https?://.*',
'duration': 15,
- 'uploader': 'md5:24acc456b62b938a7e2dd88e978b20d9',
+ 'uploader': 'md5:363e08ccb6c691314710429f379bffe5',
'uploader_id': '188294915489964032',
'timestamp': 1538248586,
'upload_date': '20180929',
@@ -124,78 +142,85 @@ class TikTokIE(TikTokBaseIE):
}
}]
+ def _real_initialize(self):
+ # Set up the session by requesting the site's front page once before
+ # extraction; the response is discarded (per the original note, this
+ # request is what sets the necessary cookies)
+ self._request_webpage(
+ 'https://www.tiktok.com/', None, note='Setting up session')
+
def _real_extract(self, url):
- video_id = self._match_id(url)
+ m = re.match(self._VALID_URL, url).groupdict()
+ video_id = m['id']
+ if 'user_id' in m:
+ url = 'https://www.tiktok.com/@%(user_id)s/video/%(id)s/' % m
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url.replace('://m.', '://www.'), video_id)
- page_props = self._parse_json(self._search_regex(
- r'''(?s)]+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*[^/?#&]+)'
+class TikTokUserIE(TikTokIE):
+ IE_DESC = 'TikTok user profile extractor'
+ IE_NAME = 'tiktok:user'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?tiktok\.com/@(?P[^/?#&]+)'
_TESTS = [{
'url': 'https://www.tiktok.com/@zureeal',
'info_dict': {
'id': '188294915489964032',
},
- 'playlist_mincount': 24,
+ 'playlist_mincount': 30,
+ 'expected_warnings': [
+ 'More videos are available',
+ ],
}]
@classmethod
def suitable(cls, url):
- return False if TikTokIE.suitable(url) else super(TikTokUserIE, cls).suitable(url)
+ # URLs that the single-video extractor accepts are never treated as
+ # profile URLs.  For the rest, super() is anchored at TikTokBaseIE, so
+ # the lookup starts after TikTokBaseIE in the MRO and any suitable()
+ # defined on TikTokIE or TikTokBaseIE is skipped.
+ return False if TikTokIE.suitable(url) else super(TikTokBaseIE, cls).suitable(url)
def _real_extract(self, url):
user_id = self._match_id(url)
- webpage = self._download_webpage(url, user_id)
- page_props = self._parse_json(self._search_regex(
- r'''(?s)[^/?#&.]+)'
+ _TESTS = [{
+ 'url': 'https://vm.tiktok.com/ZMLesneqK/',
+ 'info_dict': {
+ 'id': '7054218882072055046',
+ 'ext': 'mp4',
+ 'title': 'EddY',
+ 'upload_date': '20220117',
+ 'description': 'Hilft bestimmt gegen nervige Anrufer! 😂 #telefon #call #prank #fail #sprecher #stimme #voice #band #ansage #sound #comedy #unterhaltung #scammer #fy',
+ 'timestamp': 1642438324,
+ 'uploader': 'EddY',
+ 'uploader_id': '6850021004246467590',
+ },
+ }]
+
+ # Adds a 'User-Agent' header via the decorator when none was given.
+ # super() is anchored at GenericIE, so the lookup starts after GenericIE
+ # in the MRO and any _request_webpage defined on GenericIE itself is
+ # bypassed.
+ # NOTE(review): this presumes the enclosing class derives from GenericIE;
+ # the class header is not visible here - confirm.
+ @vanilla_UA_request
+ def _request_webpage(self, *args, **kwargs):
+ return super(GenericIE, self)._request_webpage(*args, **kwargs)