[motherless] Fixed the broken uploader_id in the extractor (#31243)
* Fixed the broken uploader_id in the extractor.
* Make uploader_id RE looser
* Fix uploader_id in test Motherless_3
* Fix group pagination
* # coding: utf-8

Co-authored-by: Andy Xuming <xuminic@gmail.com>
Co-authored-by: dirkf <fieldhouse@gmx.net>
3 people authored Oct 10, 2022
1 parent 1b14428 commit 82e4eca
Showing 1 changed file with 7 additions and 6 deletions.

youtube_dl/extractor/motherless.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals

 import datetime
@@ -71,7 +72,7 @@ class MotherlessIE(InfoExtractor):
             'title': 'a/ Hot Teens',
             'categories': list,
             'upload_date': '20210104',
-            'uploader_id': 'yonbiw',
+            'uploader_id': 'anonymous',
             'thumbnail': r're:https?://.*\.jpg',
             'age_limit': 18,
         },
@@ -127,7 +128,7 @@ def _real_extract(self, url):

         comment_count = webpage.count('class="media-comment-contents"')
         uploader_id = self._html_search_regex(
-            r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
+            r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)''',
             webpage, 'uploader_id')

         categories = self._html_search_meta('keywords', webpage, default=None)
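
The loosened pattern is the core of the fix: it accepts both the legacy thumb-member-username wrapper and the newer media-meta-member markup, and it tolerates extra attributes and whitespace before the /m/ profile link. Below is a minimal standalone check of the old pattern against the new one; the HTML snippets and the OLD_RE/NEW_RE names are illustrative assumptions, not taken from the commit, and plain re.search stands in for _html_search_regex.

import re

OLD_RE = r'"thumb-member-username">\s+<a href="/m/([^"]+)"'
NEW_RE = r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''

# hypothetical markup variants: the old wrapper class and a newer one
legacy_markup = '<div class="thumb-member-username">\n  <a href="/m/yonbiw">yonbiw</a></div>'
current_markup = '<div class="media-meta-member">\n  <a class="username" href="/m/anonymous">anonymous</a></div>'

for page in (legacy_markup, current_markup):
    old = re.search(OLD_RE, page)
    new = re.search(NEW_RE, page)
    print((old.group(1) if old else None, new.group(1) if new else None))
# prints ('yonbiw', 'yonbiw') for the legacy markup and (None, 'anonymous')
# for the current one: the old pattern silently stopped matching, which is
# the "broken uploader_id" named in the commit title.
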
@@ -169,7 +170,7 @@ class MotherlessGroupIE(InfoExtractor):
             'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
                            'any kind!'
         },
-        'playlist_mincount': 9,
+        'playlist_mincount': 0,
     }]

     @classmethod
@@ -208,9 +209,9 @@ def _real_extract(self, url):
             r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
         description = self._html_search_meta(
             'description', webpage, fatal=False)
-        page_count = self._int(self._search_regex(
-            r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT',
-            webpage, 'page_count'), 'page_count')
+        page_count = str_to_int(self._search_regex(
+            r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
+            webpage, 'page_count', default='1'))
         PAGE_SIZE = 80

         def _get_page(idx):
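
The pagination change swaps self._int for str_to_int, accepts either a literal NEXT link or a rel="next" attribute, and defaults to a single page instead of failing when a group has no pagination at all (the "Fix group pagination" item above). A standalone sketch of the new behaviour follows; the sample HTML and the page_count helper are illustrative assumptions, with plain re.search standing in for _search_regex.

import re

PAGE_COUNT_RE = r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b'
PAGE_SIZE = 80

def page_count(webpage):
    # mirrors _search_regex(..., default='1') followed by str_to_int():
    # a missing NEXT link now means one page rather than a fatal error
    m = re.search(PAGE_COUNT_RE, webpage)
    return int(m.group(1)) if m else 1

paginated = '<a href="?page=12">12</a> <a href="?page=2" rel="next">NEXT</a>'
single_page = '<div class="content">only one page of group thumbs</div>'

print(page_count(paginated))    # 12, i.e. up to 12 * PAGE_SIZE = 960 entries
print(page_count(single_page))  # 1
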
