Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2024-12-25 12:45:51 +01:00)
[extractor] Simplify search extractors
Commit: cc16383ff3 (parent: a903d8285c)
yt_dlp/extractor/common.py

@@ -4,6 +4,7 @@
 import base64
 import datetime
 import hashlib
+import itertools
 import json
 import netrc
 import os
@@ -3617,7 +3618,14 @@ def _real_extract(self, query):
         return self._get_n_results(query, n)
 
     def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
+        """Get a specified number of results for a query.
+        Either this function or _search_results must be overridden by subclasses """
+        return self.playlist_result(
+            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
+            query, query)
+
+    def _search_results(self, query):
+        """Returns an iterator of search results"""
         raise NotImplementedError('This method must be implemented by subclasses')
 
     @property
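With this change the result limit is handled once in the base class: _get_n_results slices whatever _search_results yields, so a concrete search extractor only has to provide a generator. A minimal sketch of a subclass under the new contract (the class name, URL and JSON fields below are made up for illustration and are not part of this commit):

import itertools

from yt_dlp.extractor.common import SearchInfoExtractor


class ExampleSearchIE(SearchInfoExtractor):
    """Hypothetical extractor demonstrating the new _search_results contract."""
    IE_NAME = 'example:search'
    _SEARCH_KEY = 'exsearch'

    def _search_results(self, query):
        # Yield results lazily; _get_n_results truncates them to the requested count
        for page in itertools.count(1):
            data = self._download_json(
                'https://example.com/api/search', query,
                note=f'Downloading page {page}',
                query={'q': query, 'page': page})
            for item in data.get('results') or []:
                yield self.url_result(item['url'])
            if not data.get('next_page'):
                return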
yt_dlp/extractor/googlesearch.py

@@ -11,6 +11,7 @@ class GoogleSearchIE(SearchInfoExtractor):
     _MAX_RESULTS = 1000
     IE_NAME = 'video.google:search'
     _SEARCH_KEY = 'gvsearch'
+    _WORKING = False
     _TEST = {
         'url': 'gvsearch15:python language',
         'info_dict': {
@@ -20,16 +21,7 @@ class GoogleSearchIE(SearchInfoExtractor):
         'playlist_count': 15,
     }
 
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-
-        entries = []
-        res = {
-            '_type': 'playlist',
-            'id': query,
-            'title': query,
-        }
-
+    def _search_results(self, query):
         for pagenum in itertools.count():
             webpage = self._download_webpage(
                 'http://www.google.com/search',
@@ -44,16 +36,8 @@ def _get_n_results(self, query, n):
 
             for hit_idx, mobj in enumerate(re.finditer(
                     r'<h3 class="r"><a href="([^"]+)"', webpage)):
-
-                # Skip playlists
-                if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
-                    continue
-
-                entries.append({
-                    '_type': 'url',
-                    'url': mobj.group(1)
-                })
-
-            if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
-                res['entries'] = entries[:n]
-                return res
+                if re.search(f'id="vidthumb{hit_idx + 1}"', webpage):
+                    yield self.url_result(mobj.group(1))
+
+            if not re.search(r'id="pnnext"', webpage):
+                return
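The per-extractor bookkeeping (the entries list, the len(entries) >= n check, the trailing [:n]) disappears because the base class now truncates via itertools.islice(..., 0, None if n == float('inf') else n). A small self-contained illustration of that idiom (the toy generator below is not from yt-dlp):

import itertools


def fake_search_results():
    # Stand-in for a _search_results generator that yields five hits
    yield from (f'result {i}' for i in range(1, 6))


def take(n):
    # None as the stop value means "no limit", which is how n == float('inf') is handled
    return list(itertools.islice(fake_search_results(), 0, None if n == float('inf') else n))


print(take(3))             # ['result 1', 'result 2', 'result 3']
print(take(float('inf')))  # all five results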
yt_dlp/extractor/niconico.py

@@ -709,11 +709,9 @@ class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE):
     _SEARCH_KEY = 'nicosearch'
     _TESTS = []
 
-    def _get_n_results(self, query, n):
-        entries = self._entries(self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
-        if n < float('inf'):
-            entries = itertools.islice(entries, 0, n)
-        return self.playlist_result(entries, query, query)
+    def _search_results(self, query):
+        return self._entries(
+            self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
 
 
 class NicovideoSearchDateIE(NicovideoSearchIE):
yt_dlp/extractor/soundcloud.py

@@ -880,30 +880,19 @@ def _get_collection(self, endpoint, collection_id, **query):
         })
         next_url = update_url_query(self._API_V2_BASE + endpoint, query)
 
-        collected_results = 0
-
         for i in itertools.count(1):
             response = self._download_json(
-                next_url, collection_id, 'Downloading page {0}'.format(i),
+                next_url, collection_id, f'Downloading page {i}',
                 'Unable to download API page', headers=self._HEADERS)
 
-            collection = response.get('collection', [])
-            if not collection:
-                break
-
-            collection = list(filter(bool, collection))
-            collected_results += len(collection)
-
-            for item in collection:
-                yield self.url_result(item['uri'], SoundcloudIE.ie_key())
-
-            if not collection or collected_results >= limit:
-                break
+            for item in response.get('collection') or []:
+                if item:
+                    yield self.url_result(item['uri'], SoundcloudIE.ie_key())
 
             next_url = response.get('next_href')
             if not next_url:
                 break
 
     def _get_n_results(self, query, n):
         tracks = self._get_collection('search/tracks', query, limit=n, q=query)
-        return self.playlist_result(tracks, playlist_title=query)
+        return self.playlist_result(tracks, query, query)
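SoundCloud's collection helper keeps its pagination loop but drops the manual result counting; skipping empty items is now done inline with response.get('collection') or [] and an if item: guard instead of list(filter(bool, collection)). The two forms are equivalent for this purpose, as a quick sketch with dummy data shows:

collection = [{'uri': 'a'}, None, {'uri': 'b'}, {}]

# Old approach: materialise a filtered list up front
old = list(filter(bool, collection))

# New approach: filter lazily while iterating
new = [item for item in (collection or []) if item]

assert old == new == [{'uri': 'a'}, {'uri': 'b'}]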
yt_dlp/extractor/yahoo.py

@@ -334,30 +334,14 @@ class YahooSearchIE(SearchInfoExtractor):
     IE_NAME = 'screen.yahoo:search'
     _SEARCH_KEY = 'yvsearch'
 
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-        entries = []
+    def _search_results(self, query):
         for pagenum in itertools.count(0):
             result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
             info = self._download_json(result_url, query,
                                        note='Downloading results page ' + str(pagenum + 1))
-            m = info['m']
-            results = info['results']
-
-            for (i, r) in enumerate(results):
-                if (pagenum * 30) + i >= n:
-                    break
-                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
-                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
-                entries.append(e)
-            if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
+            yield from (self.url_result(result['rurl']) for result in info['results'])
+            if info['m']['last'] >= info['m']['total'] - 1:
                 break
-
-        return {
-            '_type': 'playlist',
-            'id': query,
-            'entries': entries,
-        }
 
 
 class YahooGyaOPlayerIE(InfoExtractor):
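The Yahoo rewrite is the largest one: instead of accumulating an entries list and re-parsing each result with a regex, the new _search_results yields URL results straight from the JSON with a yield from over a generator expression. A tiny reminder of how that idiom replaces the append loop (dummy data, not Yahoo's API):

results = [{'rurl': 'https://example.com/a'}, {'rurl': 'https://example.com/b'}]


def append_style():
    entries = []
    for r in results:
        entries.append(r['rurl'])
    return entries


def yield_from_style():
    yield from (r['rurl'] for r in results)


assert append_style() == list(yield_from_style())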
yt_dlp/extractor/youtube.py

@@ -4615,11 +4615,10 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
     _SEARCH_PARAMS = None
     _TESTS = []
 
-    def _entries(self, query, n):
+    def _search_results(self, query):
         data = {'query': query}
         if self._SEARCH_PARAMS:
             data['params'] = self._SEARCH_PARAMS
-        total = 0
         continuation = {}
         for page_num in itertools.count(1):
             data.update(continuation)
@@ -4662,17 +4661,10 @@ def _entries(self, query, n):
                     continue
 
                 yield self._extract_video(video)
-                total += 1
-                if total == n:
-                    return
 
             if not continuation:
                 break
 
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-        return self.playlist_result(self._entries(query, n), query, query)
-
 
 class YoutubeSearchDateIE(YoutubeSearchIE):
     IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
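None of this changes how searches are invoked: the _SEARCH_KEY prefixes seen above (gvsearch, nicosearch, yvsearch and YouTube's ytsearch) still take an optional count, which becomes the n that _get_n_results slices to. A usage sketch via the Python API (the option values are illustrative):

import yt_dlp

# 'ytsearch5:' asks the YouTube search extractor for at most five results;
# SearchInfoExtractor turns the "5" into n=5 before slicing _search_results.
with yt_dlp.YoutubeDL({'quiet': True, 'extract_flat': True}) as ydl:
    info = ydl.extract_info('ytsearch5:python tutorial', download=False)
    print(len(list(info['entries'])))  # at most 5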