From 416da574ec0df3388f652e44f7fe71b1e3a4701f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Oct 2020 21:31:37 +0700 Subject: [PATCH 01/87] [ytsearch] Fix extraction (closes #26920) --- youtube_dl/extractor/youtube.py | 116 +++++++++++++++++++++----------- 1 file changed, 78 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 02f3ab61a..bd1515380 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3181,54 +3181,94 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): _MAX_RESULTS = float('inf') IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' - _EXTRA_QUERY_ARGS = {} + _SEARCH_PARAMS = None _TESTS = [] + def _entries(self, query, n): + data = { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20201021.03.00', + } + }, + 'query': query, + } + if self._SEARCH_PARAMS: + data['params'] = self._SEARCH_PARAMS + total = 0 + for page_num in itertools.count(1): + search = self._download_json( + 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + video_id='query "%s"' % query, + note='Downloading page %s' % page_num, + errnote='Unable to download API page', fatal=False, + data=json.dumps(data).encode('utf8'), + headers={'content-type': 'application/json'}) + if not search: + break + slr_contents = try_get( + search, + (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], + lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), + list) + if not slr_contents: + break + isr_contents = try_get( + slr_contents, + lambda x: x[0]['itemSectionRenderer']['contents'], + list) + if not isr_contents: + break + for content in isr_contents: + if not isinstance(content, dict): + continue + video = content.get('videoRenderer') + if not isinstance(video, dict): + 
continue + video_id = video.get('videoId') + if not video_id: + continue + title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str) + description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) + duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str)) + view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or '' + view_count = int_or_none(self._search_regex( + r'^(\d+)', re.sub(r'\s', '', view_count_text), + 'view count', default=None)) + uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + total += 1 + yield { + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + } + if total == n: + return + token = try_get( + slr_contents, + lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], + compat_str) + if not token: + break + data['continuation'] = token + def _get_n_results(self, query, n): """Get a specified number of results for a query""" - - videos = [] - limit = n - - url_query = { - 'search_query': query.encode('utf-8'), - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' 
+ compat_urllib_parse_urlencode(url_query) - - for pagenum in itertools.count(1): - data = self._download_json( - result_url, video_id='query "%s"' % query, - note='Downloading page %s' % pagenum, - errnote='Unable to download API page', - query={'spf': 'navigate'}) - html_content = data[1]['body']['content'] - - if 'class="search-message' in html_content: - raise ExtractorError( - '[youtube] No video results', expected=True) - - new_videos = list(self._process_page(html_content)) - videos += new_videos - if not new_videos or len(videos) > limit: - break - next_link = self._html_search_regex( - r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', - html_content, 'next link', default=None) - if next_link is None: - break - result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) - - if len(videos) > n: - videos = videos[:n] - return self.playlist_result(videos, query) + return self.playlist_result(self._entries(query, n), query) class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' - _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} + _SEARCH_PARAMS = 'CAI%3D' class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): From 6d4733ce7be2e5bb31d1a21f68c6123074d584a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Nov 2020 06:52:00 +0700 Subject: [PATCH 02/87] [youtube] Fix JS player URL extraction --- youtube_dl/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bd1515380..c31731ac0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2086,7 +2086,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if cipher: if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = 
r'"assets":.+?"js":\s*("[^"]+")' + ASSETS_RE = ( + r']+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base', + r'"jsUrl"\s*:\s*("[^"]+")', + r'"assets":.+?"js":\s*("[^"]+")') jsplayer_url_json = self._search_regex( ASSETS_RE, embed_webpage if age_gate else video_webpage, From b9bceba37ca277261ba9cd37931036b3a52e3e07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Nov 2020 07:34:20 +0700 Subject: [PATCH 03/87] [ChangeLog] Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 9b52b7bd2..7ef22edfc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version + +Core +* [utils] Don't attempt to coerce JS strings to numbers in js_to_json (#26851) +* [downloader/http] Properly handle missing message in SSLError (#26646) +* [downloader/http] Fix access to not yet opened stream in retry + +Extractors +* [youtube] Fix JS player URL extraction +* [ytsearch] Fix extraction (#26920) +* [afreecatv] Fix typo (#26970) +* [23video] Relax URL regular expression (#26870) ++ [ustream] Add support for video.ibm.com (#26894) +* [iqiyi] Fix typo (#26884) ++ [expressen] Add support for di.se (#26670) +* [iprima] Improve video id extraction (#26507, #26494) + + version 2020.09.20 Core From 34299510bbe077c08e57929d835d212487037e2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Nov 2020 08:52:27 +0700 Subject: [PATCH 04/87] release 2020.11.01 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index ce0319fe2..8bf96cb24 100644 --- 
a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.09.20** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.09.20 + [debug] youtube-dl version 2020.11.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index a4002603c..61c005e91 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.09.20** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 3f8b6ce2e..c7eabe344 100644 --- 
a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.09.20** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index d880c225a..446b82e64 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.09.20** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.09.20 + [debug] youtube-dl version 2020.11.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index dd5fb5144..69657448a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl 
version **2020.09.20** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 7ef22edfc..cfe4ab79d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.11.01 Core * [utils] Don't attempt to coerce JS strings to numbers in js_to_json (#26851) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 709e5c74c..7d89dd3e1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.09.20' +__version__ = '2020.11.01' From 051071203ccedcaf6d0de0f0d83af593c78673e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Nov 2020 08:58:40 +0700 Subject: [PATCH 05/87] release 2020.11.01.1 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- youtube_dl/version.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 8bf96cb24..846c7e2ab 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.01** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with 
(`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.01 + [debug] youtube-dl version 2020.11.01.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 61c005e91..26823fc9f 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.01** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index c7eabe344..3d3ba3182 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.01** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 446b82e64..288bae2d4 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] 
I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.01** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.01 + [debug] youtube-dl version 2020.11.01.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 69657448a..77a195827 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.01** +- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7d89dd3e1..6d7f14717 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.01' +__version__ = '2020.11.01.1' From 2de2ca6659a18b6f5ab76565e00491153ae47276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 12 Nov 2020 06:16:37 +0700 Subject: [PATCH 06/87] [youtube] Rework extractors WIP --- 
test/test_all_urls.py | 22 +- youtube_dl/extractor/extractors.py | 9 +- youtube_dl/extractor/youtube.py | 1105 ++++++++++++++-------------- 3 files changed, 566 insertions(+), 570 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 81056a999..348744028 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -34,13 +34,13 @@ class TestAllURLsMatching(unittest.TestCase): assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 assertPlaylist('PL63F0C78739B09958') - assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') + # assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') + # assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) # Top tracks - assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101') + # assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101') def test_youtube_matching(self): self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) @@ -51,26 +51,22 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube']) def test_youtube_channel_matching(self): - assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) + assertChannel = lambda url: self.assertMatch(url, ['youtube:tab']) assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') - def 
test_youtube_user_matching(self): - self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user']) + # def test_youtube_user_matching(self): + # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) def test_youtube_feeds(self): self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) - self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) - def test_youtube_show_matching(self): - self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) - - def test_youtube_search_matching(self): - self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) - self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) + # def test_youtube_search_matching(self): + # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) + # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ae7079a6a..9d7fecfe8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1474,21 +1474,18 @@ from .yourporn import YourPornIE from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, - YoutubeChannelIE, - YoutubeFavouritesIE, YoutubeHistoryIE, YoutubeLiveIE, + YoutubeTabIE, YoutubePlaylistIE, - YoutubePlaylistsIE, YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, - YoutubeSearchURLIE, - 
YoutubeShowIE, + #YoutubeSearchURLIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, - YoutubeUserIE, + YoutubeYtUserIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c31731ac0..696dec2c1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,8 +16,6 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, - compat_HTTPError, - compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -30,10 +28,8 @@ from ..utils import ( bool_or_none, clean_html, error_to_compat_str, - extract_attributes, ExtractorError, float_or_none, - get_element_by_attribute, get_element_by_id, int_or_none, mimetype2ext, @@ -52,6 +48,7 @@ from ..utils import ( uppercase_escape, url_or_none, urlencode_postdata, + urljoin, ) @@ -269,13 +266,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True - def _download_webpage_handle(self, *args, **kwargs): - query = kwargs.get('query', {}).copy() - query['disable_polymer'] = 'true' - kwargs['query'] = query - return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) - def _real_initialize(self): if self._downloader is None: return @@ -283,93 +273,34 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if not self._login(): return + _DEFAULT_API_DATA = { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20201021.03.00', + } + }, + } -class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): - # Extract entries from page with "Load more" button - def _entries(self, page, playlist_id): - more_widget_html = content_html = page - for page_num in itertools.count(1): - for entry in self._process_page(content_html): - yield entry + def _call_api(self, ep, query, video_id): + data = self._DEFAULT_API_DATA.copy() + data.update(query) - 
mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) - if not mobj: - break + response = self._download_json( + 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id, + note='Downloading API JSON', errnote='Unable to download API page', + data=json.dumps(data).encode('utf8'), + headers={'content-type': 'application/json'}, + query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'}) - count = 0 - retries = 3 - while count <= retries: - try: - # Downloading page may result in intermittent 5xx HTTP error - # that is usually worked around with a retry - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s%s' - % (page_num, ' (retry #%d)' % count if count else ''), - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): - count += 1 - if count <= retries: - continue - raise + return response - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] - - -class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) - - def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): - for mobj in re.finditer(video_re, page): - # The link with index 0 is not the first video of the playlist (not sure if still actual) - if 'index' in mobj.groupdict() and mobj.group('id') == '0': - continue - video_id = mobj.group('id') - video_title = unescapeHTML( - mobj.group('title')) if 'title' in mobj.groupdict() else None - if video_title: - video_title = video_title.strip() - if video_title == '► Play 
all': - video_title = None - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) - - -class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for playlist_id in orderedSet(re.findall( - r']+class="[^"]*yt-lockup-title[^"]*"[^>]*>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', - content)): - yield self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._og_search_title(webpage, fatal=False) - return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) + def _extract_yt_initial_data(self, video_id, webpage): + return self._parse_json( + self._search_regex( + r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;', + webpage, 'yt initial data'), + video_id) class YoutubeIE(YoutubeBaseInfoExtractor): @@ -430,7 +361,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID + (?P[0-9A-Za-z_-]{11}) # here is it! 
the YouTube video ID (?!.*?\blist= (?: %(playlist_id)s| # combined list/video URLs are handled by the playlist IE @@ -1891,6 +1822,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) + if not video_info and not player_response: + player_response = extract_player_response( + self._search_regex( + r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage, + 'initial player response', default='{}'), + video_id) + def extract_unavailable_message(): messages = [] for tag, kind in (('h1', 'message'), ('div', 'submessage')): @@ -2564,7 +2502,502 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } -class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): +class YoutubeTabIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com tab' + _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|playlist\?.*?\blist=)(?P[^/?#&]+)' + IE_NAME = 'youtube:tab' + + _TESTS = [{ + # playlists, multipage + 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + }, + }, { + # playlists, multipage, different order + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + }, + }, { + # playlists, singlepage + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'ThirstForScience', + 'title': 'ThirstForScience', + } + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, + }, { + # basic, single video playlist + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'info_dict': { + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 
'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', + }, + 'playlist_count': 1, + }, { + # empty playlist + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'info_dict': { + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', + }, + 'playlist_count': 0, + }, { + # Home tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Home', + }, + 'playlist_mincount': 2, + }, { + # Videos tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + }, + 'playlist_mincount': 975, + }, { + # Videos tab, sorted by popular + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + }, + 'playlist_mincount': 199, + }, { + # Playlists tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Playlists', + }, + 'playlist_mincount': 17, + }, { + # Community tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Community', + }, + 'playlist_mincount': 18, + }, { + # Channels tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Channels', + }, + 'playlist_mincount': 138, + }, { + 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', + 
'only_matching': True, + }, { + 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ', + 'only_matching': True, + }, { + 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', + 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'info_dict': { + 'title': '29C3: Not my department', + 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'uploader': 'Christiaan008', + 'uploader_id': 'ChRiStIaAn008', + }, + 'playlist_count': 96, + }, { + 'note': 'Large playlist', + 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', + 'info_dict': { + 'title': 'Uploads from Cauchemar', + 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader': 'Cauchemar', + 'uploader_id': 'Cauchemar89', + }, + 'playlist_mincount': 1123, + }, { + # even larger playlist, 8832 videos + 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', + 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', + 'info_dict': { + 'title': 'Uploads from Interstellar Movie', + 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', + 'uploader': 'Interstellar Movie', + 'uploader_id': 'InterstellarMovie1', + }, + 'playlist_mincount': 21, + }, { + # https://github.com/ytdl-org/youtube-dl/issues/21844 + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'Computerphile', + 'uploader': 'Computerphile', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if YoutubeLiveIE.suitable(url) else super( + YoutubeTabIE, cls).suitable(url) + + def _extract_channel_id(self, webpage): + channel_id = 
self._html_search_meta( + 'channelId', webpage, 'channel id', default=None) + if channel_id: + return channel_id + channel_url = self._html_search_meta( + ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url', + 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad', + 'twitter:app:url:googleplay'), webpage, 'channel url') + return self._search_regex( + r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+', + channel_url, 'channel id') + + @staticmethod + def _extract_grid_item_renderer(item): + for item_kind in ('Playlist', 'Video', 'Channel'): + renderer = item.get('grid%sRenderer' % item_kind) + if renderer: + return renderer + + def _extract_video(self, renderer): + video_id = renderer.get('videoId') + title = try_get( + renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + description = try_get( + renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], + compat_str) + duration = parse_duration(try_get( + renderer, lambda x: x['lengthText']['simpleText'], compat_str)) + view_count_text = try_get( + renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' + view_count = int_or_none(self._search_regex( + r'^(\d+)', re.sub(r'\s', '', view_count_text), + 'view count', default=None)) + uploader = try_get( + renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + return { + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + } + + def _grid_entries(self, grid_renderer): + for item in grid_renderer['items']: + if not isinstance(item, dict): + continue + renderer = self._extract_grid_item_renderer(item) + if not isinstance(renderer, dict): + continue + title = try_get( + renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + # playlist + playlist_id = renderer.get('playlistId') + if playlist_id: + yield self.url_result( + 
'https://www.youtube.com/playlist?list=%s' % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=title) + # video + video_id = renderer.get('videoId') + if video_id: + yield self._extract_video(renderer) + # channel + channel_id = renderer.get('channelId') + if channel_id: + title = try_get( + renderer, lambda x: x['title']['simpleText'], compat_str) + yield self.url_result( + 'https://www.youtube.com/channel/%s' % channel_id, + ie=YoutubeTabIE.ie_key(), video_title=title) + + def _shelf_entries_trimmed(self, shelf_renderer): + renderer = try_get( + shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict) + if not renderer: + return + # TODO: add support for nested playlists so each shelf is processed + # as separate playlist + # TODO: this includes only first N items + for entry in self._grid_entries(renderer): + yield entry + + def _shelf_entries(self, shelf_renderer): + ep = try_get( + shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str) + shelf_url = urljoin('https://www.youtube.com', ep) + if not shelf_url: + return + title = try_get( + shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + yield self.url_result(shelf_url, video_title=title) + + def _playlist_entries(self, video_list_renderer): + for content in video_list_renderer['contents']: + if not isinstance(content, dict): + continue + renderer = content.get('playlistVideoRenderer') + if not isinstance(renderer, dict): + continue + video_id = renderer.get('videoId') + if not video_id: + continue + yield self._extract_video(renderer) + + def _video_entry(self, video_renderer): + video_id = video_renderer.get('videoId') + if video_id: + return self._extract_video(video_renderer) + + def _post_thread_entries(self, post_thread_renderer): + post_renderer = try_get( + post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) + if not post_renderer: + return + # video attachment + 
video_renderer = try_get( + post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) + video_id = None + if video_renderer: + entry = self._video_entry(video_renderer) + if entry: + yield entry + # inline video links + runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or [] + for run in runs: + if not isinstance(run, dict): + continue + ep_url = try_get( + run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str) + if not ep_url: + continue + if not YoutubeIE.suitable(ep_url): + continue + ep_video_id = YoutubeIE._match_id(ep_url) + if video_id == ep_video_id: + continue + yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id) + + def _post_thread_continuation_entries(self, post_thread_continuation): + contents = post_thread_continuation.get('contents') + if not isinstance(contents, list): + return + for content in contents: + renderer = content.get('backstagePostThreadRenderer') + if not isinstance(renderer, dict): + continue + for entry in self._post_thread_entries(renderer): + yield entry + + @staticmethod + def _extract_next_continuation_data(renderer): + next_continuation = try_get( + renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict) + if not next_continuation: + return + continuation = next_continuation.get('continuation') + if not continuation: + return + ctp = next_continuation.get('clickTrackingParams') + return { + 'ctoken': continuation, + 'continuation': continuation, + 'itct': ctp, + } + + @classmethod + def _extract_continuation(cls, renderer): + next_continuation = cls._extract_next_continuation_data(renderer) + if next_continuation: + return next_continuation + contents = renderer.get('contents') + if not isinstance(contents, list): + return + for content in contents: + if not isinstance(content, dict): + continue + continuation_ep = try_get( + content, lambda x: x['continuationItemRenderer']['continuationEndpoint'], + dict) + if not continuation_ep: + 
continue + continuation = try_get( + continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) + if not continuation: + continue + ctp = continuation_ep.get('clickTrackingParams') + if not ctp: + continue + return { + 'ctoken': continuation, + 'continuation': continuation, + 'itct': ctp, + } + + def _entries(self, tab): + continuation = None + slr_contents = tab['sectionListRenderer']['contents'] + for slr_content in slr_contents: + if not isinstance(slr_content, dict): + continue + is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: + continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer): + yield entry + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for entry in self._post_thread_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + + if not continuation: + continuation = self._extract_continuation(is_renderer) + + for page_num in itertools.count(1): + if not continuation: + break + browse = self._download_json( + 'https://www.youtube.com/browse_ajax', None, + 'Downloading page %d' % page_num, + headers={ + 'x-youtube-client-name': '1', + 'x-youtube-client-version': '2.20201030.01.00', + }, 
query=continuation, fatal=False) + if not browse: + break + response = try_get(browse, lambda x: x[1]['response'], dict) + if not response: + break + + continuation_contents = try_get( + response, lambda x: x['continuationContents'], dict) + if continuation_contents: + continuation_renderer = continuation_contents.get('playlistVideoListContinuation') + if continuation_renderer: + for entry in self._playlist_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + continuation_renderer = continuation_contents.get('gridContinuation') + if continuation_renderer: + for entry in self._grid_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + continuation_renderer = continuation_contents.get('itemSectionContinuation') + if continuation_renderer: + for entry in self._post_thread_continuation_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + + continuation_items = try_get( + response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) + if continuation_items: + continuation_item = continuation_items[0] + if not isinstance(continuation_item, dict): + continue + renderer = continuation_item.get('playlistVideoRenderer') + if renderer: + video_list_renderer = {'contents': continuation_items} + for entry in self._playlist_entries(video_list_renderer): + yield entry + continuation = self._extract_continuation(video_list_renderer) + continue + + break + + @staticmethod + def _extract_selected_tab(tabs): + for tab in tabs: + if try_get(tab, lambda x: x['tabRenderer']['selected'], bool): + return tab['tabRenderer'] + else: + raise ExtractorError('Unable to find selected tab') + + def _real_extract(self, url): + channel_id = self._match_id(url) + url = compat_urlparse.urlunparse( + 
compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + webpage = self._download_webpage(url, channel_id) + data = self._extract_yt_initial_data(channel_id, webpage) + tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs'] + selected_tab = self._extract_selected_tab(tabs) + channel_title = try_get( + data, lambda x: x['metadata']['channelMetadataRenderer']['title'], + compat_str) + channel_external_id = try_get( + data, lambda x: x['metadata']['channelMetadataRenderer']['externalId'], + compat_str) + tab_title = selected_tab.get('title') + title = channel_title or channel_id + if tab_title: + title += ' - %s' % tab_title + return self.playlist_result( + self._entries(selected_tab['content']), + playlist_id=channel_external_id or channel_id, + playlist_title=title) + + +class YoutubePlaylistIE(InfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? @@ -2591,39 +3024,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): | (%(playlist_id)s) )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P\d+))?(?:[^>]+>(?P[^<]+))?)?' - _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', - }, - 'playlist_count': 1, - }, { - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'title': 'youtube-dl empty playlist', - }, - 'playlist_count': 0, - }, { - 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'uploader': 'Christiaan008', - 'uploader_id': 'ChRiStIaAn008', - }, - 'playlist_count': 96, - }, { 'note': 'issue #673', 'url': 'PLBB231211A4F62143', 'info_dict': { @@ -2632,17 +3034,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'uploader': 'Wickydoo', 'uploader_id': 'Wickydoo', }, - 'playlist_mincount': 26, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - 'uploader': 'Cauchemar', - 'uploader_id': 'Cauchemar89', - }, - 'playlist_mincount': 799, + 'playlist_mincount': 29, }, { 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', 'info_dict': { @@ -2663,7 +3055,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): } }, { 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 485, + 'playlist_mincount': 982, 'info_dict': { 'title': '2018 Chinese New Singles (11/6 updated)', 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', @@ -2679,16 +3071,6 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', }, 'skip': 'This playlist does not exist', - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - 'uploader': 'Interstellar Movie', - 'uploader_id': 'InterstellarMovie1', - }, - 'playlist_mincount': 21, }, { # Playlist URL that does not actually serve a playlist 'url': 
'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', @@ -2733,16 +3115,6 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'noplaylist': True, 'skip_download': True, }, - }, { - # https://github.com/ytdl-org/youtube-dl/issues/21844 - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'Computerphile', - 'uploader': 'Computerphile', - }, - 'playlist_mincount': 11, }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, @@ -2753,153 +3125,15 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # music album playlist 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', 'only_matching': True, - }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', - 'only_matching': True, }, { 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', 'only_matching': True, }] - def _real_initialize(self): - self._login() - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - - for item in re.findall( - r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): - attrs = extract_attributes(item) - video_id = attrs['data-video-id'] - video_title = unescapeHTML(attrs.get('data-title')) - if video_title: - video_title = video_title.strip() - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - # Fallback with old _VIDEO_RE - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - - # Relaxed fallbacks - self.extract_videos_from_page_impl( - r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - self.extract_videos_from_page_impl( - r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - - return zip(ids_in_page, 
titles_in_page) - - def _extract_mix(self, playlist_id): - # The mixes are generated from a single video - # the id of the playlist is just 'RD' + video_id - ids = [] - last_id = playlist_id[-11:] - for n in itertools.count(1): - url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) - new_ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - # Fetch new pages until all the videos are repeated, it seems that - # there are always 51 unique videos. - new_ids = [_id for _id in new_ids if _id not in ids] - if not new_ids: - break - ids.extend(new_ids) - last_id = ids[-1] - - url_results = self._ids_to_results(ids) - - search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) - title_span = ( - search_title('playlist-title') - or search_title('title long-title') - or search_title('title')) - title = clean_html(title_span) - - return self.playlist_result(url_results, playlist_id, title) - - def _extract_playlist(self, playlist_id): - url = self._TEMPLATE_URL % playlist_id - page = self._download_webpage(url, playlist_id) - - # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) - for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): - match = match.strip() - # Check if the playlist exists or is private - mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) - if mobj: - reason = mobj.group('reason') - message = 'This playlist %s' % reason - if 'private' in reason: - message += ', use --username or --netrc to access it' - message += '.' - raise ExtractorError(message, expected=True) - elif re.match(r'[^<]*Invalid parameters[^<]*', match): - raise ExtractorError( - 'Invalid parameters. 
Maybe URL is incorrect.', - expected=True) - elif re.match(r'[^<]*Choose your language[^<]*', match): - continue - else: - self.report_warning('Youtube gives an alert message: ' + match) - - playlist_title = self._html_search_regex( - r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', - page, 'title', default=None) - - _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' - uploader = self._html_search_regex( - r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, - page, 'uploader', default=None) - mobj = re.search( - r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, - page) - if mobj: - uploader_id = mobj.group('uploader_id') - uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) - else: - uploader_id = uploader_url = None - - has_videos = True - - if not playlist_title: - try: - # Some playlist URLs don't actually serve a playlist (e.g. - # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) - next(self._entries(page, playlist_id)) - except StopIteration: - has_videos = False - - playlist = self.playlist_result( - self._entries(page, playlist_id), playlist_id, playlist_title) - playlist.update({ - 'uploader': uploader, - 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - }) - - return has_videos, playlist - - def _check_download_just_video(self, url, playlist_id): - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = query_dict.get('v', [None])[0] or self._search_regex( - r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, - 'video id', default=None) - if video_id: - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add 
--no-playlist to just download video %s' % (playlist_id, video_id)) - return video_id, None - return None, None + @classmethod + def suitable(cls, url): + return False if YoutubeTabIE.suitable(url) else super( + YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): # Extract playlist id @@ -2907,184 +3141,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): if mobj is None: raise ExtractorError('Invalid URL: %s' % url) playlist_id = mobj.group(1) or mobj.group(2) - - video_id, video = self._check_download_just_video(url, playlist_id) - if video: - return video - - if playlist_id.startswith(('RD', 'UL', 'PU')): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - - has_videos, playlist = self._extract_playlist(playlist_id) - if has_videos or not video_id: - return playlist - - # Some playlist URLs don't actually serve a playlist (see - # https://github.com/ytdl-org/youtube-dl/issues/10537). - # Fallback to plain video extraction if there is a video id - # along with playlist id. - return self.url_result(video_id, 'Youtube', video_id=video_id) + return self.url_result( + 'https://www.youtube.com/playlist?list=%s' % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) -class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' - _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' 
- IE_NAME = 'youtube:channel' +class YoutubeYtUserIE(InfoExtractor): + _VALID_URL = r'ytuser:(?P<id>.+)' _TESTS = [{ - 'note': 'paginated channel', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'playlist_mincount': 91, - 'info_dict': { - 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'Uploads from lex will', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - } - }, { - 'note': 'Age restricted channel', - # from https://www.youtube.com/user/DeusExOfficial - 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', - 'playlist_mincount': 64, - 'info_dict': { - 'id': 'UUs0ifCMCm1icqRbqhUINa0w', - 'title': 'Uploads from Deus Ex', - 'uploader': 'Deus Ex', - 'uploader_id': 'DeusExOfficial', - }, - }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) - else super(YoutubeChannelIE, cls).suitable(url)) - - def _build_template_url(self, url, channel_id): - return self._TEMPLATE_URL % channel_id - - def _real_extract(self, url): - channel_id = self._match_id(url) - - url = self._build_template_url(url, channel_id) - - # Channel by page listing is restricted to 35 pages of 30 items, i.e. 
1050 videos total (see #5778) - # Workaround by extracting as a playlist if managed to obtain channel playlist URL - # otherwise fallback on channel by page extraction - channel_page = self._download_webpage( - url + '?view=57', channel_id, - 'Downloading channel page', fatal=False) - if channel_page is False: - channel_playlist_id = False - else: - channel_playlist_id = self._html_search_meta( - 'channelId', channel_page, 'channel id', default=None) - if not channel_playlist_id: - channel_url = self._html_search_meta( - ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), - channel_page, 'channel url', default=None) - if channel_url: - channel_playlist_id = self._search_regex( - r'vnd\.youtube://user/([0-9A-Za-z_-]+)', - channel_url, 'channel id', default=None) - if channel_playlist_id and channel_playlist_id.startswith('UC'): - playlist_id = 'UU' + channel_playlist_id[2:] - return self.url_result( - compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') - - channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') - autogenerated = re.search(r'''(?x) - class="[^"]*?(?: - channel-header-autogenerated-label| - yt-channel-title-autogenerated - )[^"]*"''', channel_page) is not None - - if autogenerated: - # The videos are contained in a single page - # the ajax pages can't be used, they are empty - entries = [ - self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - for video_id, video_title in self.extract_videos_from_page(channel_page)] - return self.playlist_result(entries, channel_id) - - try: - next(self._entries(channel_page, channel_id)) - except StopIteration: - alert_message = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', - channel_page, 'alert', default=None, group='alert') - if alert_message: - raise ExtractorError('Youtube said: %s' % alert_message, expected=True) - - return 
self.playlist_result(self._entries(channel_page, channel_id), channel_id) - - -class YoutubeUserIE(YoutubeChannelIE): - IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' - IE_NAME = 'youtube:user' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheLinuxFoundation', - 'playlist_mincount': 320, - 'info_dict': { - 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', - 'title': 'Uploads from The Linux Foundation', - 'uploader': 'The Linux Foundation', - 'uploader_id': 'TheLinuxFoundation', - } - }, { - # Only available via https://www.youtube.com/c/12minuteathlete/videos - # but not https://www.youtube.com/user/12minuteathlete/videos - 'url': 'https://www.youtube.com/c/12minuteathlete/videos', - 'playlist_mincount': 249, - 'info_dict': { - 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', - 'title': 'Uploads from 12 Minute Athlete', - 'uploader': '12 Minute Athlete', - 'uploader_id': 'the12minuteathlete', - } - }, { 'url': 'ytuser:phihag', 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/gametrailers', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/gametrailers', - 'only_matching': True, - }, { - # This channel is not available, geo restricted to JP - 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', - 'only_matching': True, }] - @classmethod - def suitable(cls, url): - # Don't return True if the url can be extracted with other youtube - # extractor, the regex would is too permissive and it would match. 
- other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_yt_ies): - return False - else: - return super(YoutubeUserIE, cls).suitable(url) - - def _build_template_url(self, url, channel_id): - mobj = re.match(self._VALID_URL, url) - return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) + def _real_extract(self, url): + user_id = self._match_id(url) + return self.url_result( + 'https://www.youtube.com/user/%s' % user_id, + ie=YoutubeTabIE.ie_key(), video_id=user_id) class YoutubeLiveIE(YoutubeBaseInfoExtractor): @@ -3139,45 +3212,7 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): return self.url_result(base_url) -class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:playlists' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'ThirstForScience', - 'title': 'ThirstForScience', - }, - }, { - # with "Load more" button - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 70, - 'info_dict': { - 'id': 'igorkle1', - 'title': 'Игорь Клейнер', - }, - }, { - 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', - 'playlist_mincount': 17, - 'info_dict': { - 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', - 'title': 'Chem Player', - }, - 'skip': 'Blocked', - }, { - 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', - 'only_matching': True, - }] - - -class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' 
- - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -3274,7 +3309,8 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_PARAMS = 'CAI%3D' -class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): +r""" +class YoutubeSearchURLIE(YoutubeSearchIE): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' @@ -3294,25 +3330,7 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) return self.playlist_result(self._process_page(webpage), playlist_title=query) - - -class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' - IE_NAME = 'youtube:show' - _TESTS = [{ - 'url': 'https://www.youtube.com/show/airdisasters', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'airdisasters', - 'title': 'Air Disasters', - } - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - return super(YoutubeShowIE, self)._real_extract( - 'https://www.youtube.com/show/%s/playlists' % playlist_id) +""" class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): @@ -3369,37 +3387,22 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): self._entries(page), playlist_title=self._PLAYLIST_TITLE) -class YoutubeWatchLaterIE(YoutubePlaylistIE): +class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' _VALID_URL = 
r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=WL', + 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', 'only_matching': True, }, { - 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', + 'url': 'https://www.youtube.com/feed/watch_later', 'only_matching': True, }] def _real_extract(self, url): - _, video = self._check_download_just_video(url, 'WL') - if video: - return video - _, playlist = self._extract_playlist('WL') - return playlist - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' - _LOGIN_REQUIRED = True - - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') - return self.url_result(playlist_id, 'YoutubePlaylist') + return self.url_result( + 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): From 5bbdadd5f886c15d1d765a21a463af899f8c30e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 12 Nov 2020 06:18:16 +0700 Subject: [PATCH 07/87] [ChangeLog] Actualize [ci skip] --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index cfe4ab79d..73524a182 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [youtube] Rework extractors + + version 2020.11.01 Core From 28f9568a8460088de18210d060058598e5237022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 12 Nov 2020 06:23:46 +0700 Subject: [PATCH 
08/87] release 2020.11.12 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 8 ++------ youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 20 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 846c7e2ab..a10c9fd83 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.01.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.12** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.01.1 + [debug] youtube-dl version 2020.11.12 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 26823fc9f..9cc120d3e 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.01.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.12** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 3d3ba3182..29bd5f5ac 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.01.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.12** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 288bae2d4..cc33a993f 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.01.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.12** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.01.1 + [debug] youtube-dl version 2020.11.12 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 77a195827..cd577ecef 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.01.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.01.1** +- [ ] I've verified that I'm running youtube-dl version **2020.11.12** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 73524a182..1ef7ea7b6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.11.12 Extractors * [youtube] Rework extractors diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 367545a96..0c77d017e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1131,20 +1131,16 @@ - **YourPorn** - **YourUpload** - **youtube**: YouTube.com - - **youtube:channel**: YouTube.com channels - - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists - - **youtube:playlists**: YouTube.com user/channel playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - **youtube:search**: YouTube.com searches - **youtube:search:date**: YouTube.com searches, newest videos first - - **youtube:search_url**: YouTube.com search URLs - - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) + - **youtube:tab**: YouTube.com tab - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **YoutubeYtUser** - **Zapiks** - **Zaq1** - **Zattoo** diff --git 
a/youtube_dl/version.py b/youtube_dl/version.py index 6d7f14717..04cd207ab 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.01.1' +__version__ = '2020.11.12' From 1fb034d029c8b7feafe45f64e6a0808663ad315e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 16 Nov 2020 21:03:56 +0700 Subject: [PATCH 09/87] [youtube] Remove RIAA copyrighted media from tests as per [1] 1. Github dmca and 1201 notice re youtube-dl 9-21-20 --- youtube_dl/extractor/youtube.py | 145 +------------------------------- 1 file changed, 1 insertion(+), 144 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 696dec2c1..248682a41 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -509,48 +509,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'end_time': 9, } }, - { - 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', - 'note': 'Test generic use_cipher_signature video (#897)', - 'info_dict': { - 'id': 'UxxajLWwzqY', - 'ext': 'mp4', - 'upload_date': '20120506', - 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:19a2f98d9032b9311e686ed039564f63', - 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', - 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', - 'iconic ep', 'iconic', 'love', 'it'], - 'duration': 180, - 'uploader': 'Icona Pop', - 'uploader_id': 'IconaPop', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', - 'creator': 'Icona Pop', - 'track': 'I Love It (feat. 
Charli XCX)', - 'artist': 'Icona Pop', - } - }, - { - 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', - 'note': 'Test VEVO video with age protection (#956)', - 'info_dict': { - 'id': '07FYdnEawAQ', - 'ext': 'mp4', - 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', - 'alt_title': 'Tunnel Vision', - 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', - 'duration': 419, - 'uploader': 'justintimberlakeVEVO', - 'uploader_id': 'justintimberlakeVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', - 'creator': 'Justin Timberlake', - 'track': 'Tunnel Vision', - 'artist': 'Justin Timberlake', - 'age_limit': 18, - } - }, { 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', 'note': 'Embed-only video (#1746)', @@ -567,7 +525,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } }, { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', @@ -626,24 +584,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'format': '141/bestaudio[ext=m4a]', }, }, - # JS player signature function name containing $ - { - 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', - 'info_dict': { - 'id': 'nfWlot6h_JM', - 'ext': 'm4a', - 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:307195cd21ff7fa352270fe884570ef0', - 'duration': 242, - 'uploader': 'TaylorSwiftVEVO', - 'uploader_id': 'TaylorSwiftVEVO', - 'upload_date': '20140818', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, # Controversy video { 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', @@ -675,22 +615,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'age_limit': 18, }, }, - # Age-gate video with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', - 'info_dict': { - 'id': 
'6kLq3WMV1nU', - 'ext': 'mp4', - 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', - 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 246, - 'uploader': 'LloydVEVO', - 'uploader_id': 'LloydVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', - 'upload_date': '20110629', - 'age_limit': 18, - }, - }, # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) # YouTube Red ad is not captured for creator { @@ -1105,73 +1029,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, - { - # Youtube Music Auto-generated description - # Retrieve 'artist' field from 'Artist:' in video description - # when it is present on youtube music video - 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', - 'info_dict': { - 'id': 'k0jLE7tTwjY', - 'ext': 'mp4', - 'title': 'Latch Feat. Sam Smith', - 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', - 'upload_date': '20150110', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', - 'artist': 'Disclosure', - 'track': 'Latch Feat. 
Sam Smith', - 'album': 'Latch Featuring Sam Smith', - 'release_date': '20121008', - 'release_year': 2012, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle multiple artists on youtube music video - 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', - 'info_dict': { - 'id': '74qn0eJSjpA', - 'ext': 'mp4', - 'title': 'Eastside', - 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', - 'upload_date': '20180710', - 'uploader': 'Benny Blanco - Topic', - 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', - 'artist': 'benny blanco, Halsey, Khalid', - 'track': 'Eastside', - 'album': 'Eastside', - 'release_date': '20180713', - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle youtube music video with release_year and no release_date - 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', - 'info_dict': { - 'id': '-hcAI0g-f5M', - 'ext': 'mp4', - 'title': 'Put It On Me', - 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', - 'upload_date': '20180426', - 'uploader': 'Matt Maeson - Topic', - 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', - 'artist': 'Matt Maeson', - 'track': 'Put It On Me', - 'album': 'The Hearse', - 'release_date': None, - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, { 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', 'only_matching': True, From 1737ea69b978f7ebe0f52c526d3dda5ba5c196b6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 15:55:07 +0100 Subject: [PATCH 10/87] [cnbc] fix extraction --- youtube_dl/extractor/cnbc.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index 6889b0f40..7b9f4536a 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import 
unicode_literals +import re from .common import InfoExtractor from ..utils import smuggle_url @@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor): class CNBCVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)' _TEST = { 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', 'info_dict': { @@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor): } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, - 'video id') + path, display_id = re.match(self._VALID_URL, url).groups() + video_id = self._download_json( + 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ + 'query': '''{ + page(path: "%s") { + vcpsId + } +}''' % path, + })['data']['page']['vcpsId'] return self.url_result( - 'http://video.cnbc.com/gallery/?video=%s' % video_id, + 'http://video.cnbc.com/gallery/?video=%d' % video_id, CNBCIE.ie_key()) From efc589b86578ad98025aa0a9ccfa5db3195c7deb Mon Sep 17 00:00:00 2001 From: Edward Betts <edward@4angle.com> Date: Mon, 16 Nov 2020 15:08:20 +0000 Subject: [PATCH 11/87] [devscripts/make_lazy_extractors] Correct a spelling mistake (#26991) --- devscripts/make_lazy_extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 0a1762dbc..878ae72b1 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -61,7 +61,7 @@ def build_lazy_ie(ie, name): return s -# find the correct sorting and add the required base classes so that sublcasses +# find the correct sorting and add the required base classes so that subclasses # can be correctly created classes = _ALL_CLASSES[:-1] ordered_cls = [] 
From 2ea9c97432a5342f70ed87d440cb1ec97a21cbde Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 16:18:37 +0100 Subject: [PATCH 12/87] [nbc] fix NBCNews/Today/MSNBC extraction --- youtube_dl/extractor/nbc.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 6f3cb3003..ea5f5a315 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -10,7 +10,6 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, - js_to_json, parse_duration, smuggle_url, try_get, @@ -394,8 +393,8 @@ class NBCNewsIE(ThePlatformIE): webpage = self._download_webpage(url, video_id) data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.+});', webpage, - 'bootstrap json'), video_id, js_to_json) + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'bootstrap json'), video_id)['props']['initialState'] video_data = try_get(data, lambda x: x['video']['current'], dict) if not video_data: video_data = data['article']['content'][0]['primaryMedia']['video'] From 650aec4a984c3286b015de9870bef4e51bf6303f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 16:21:48 +0100 Subject: [PATCH 13/87] [usanetwork] fix extraction --- youtube_dl/extractor/usanetwork.py | 82 ++++++------------------------ 1 file changed, 16 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/usanetwork.py b/youtube_dl/extractor/usanetwork.py index 54c7495cc..e3784e55f 100644 --- a/youtube_dl/extractor/usanetwork.py +++ b/youtube_dl/extractor/usanetwork.py @@ -1,74 +1,24 @@ # coding: utf-8 from __future__ import unicode_literals -from .adobepass import AdobePassIE -from ..utils import ( - NO_DEFAULT, - smuggle_url, - update_url_query, -) +from .nbc import NBCIE -class USANetworkIE(AdobePassIE): - _VALID_URL = 
r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', - 'md5': '33c0d2ba381571b414024440d08d57fd', +class USANetworkIE(NBCIE): + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/[^/]+/video/[^/]+/(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302', 'info_dict': { - 'id': '3086229', + 'id': '4185302', 'ext': 'mp4', - 'title': 'HPE Cybersecurity', - 'description': 'The more we digitize our world, the more vulnerable we are.', - 'upload_date': '20160818', - 'timestamp': 1471535460, - 'uploader': 'NBCU-USA', + 'title': 'Intelligence (Trailer)', + 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.', + 'upload_date': '20200715', + 'timestamp': 1594785600, + 'uploader': 'NBCU-MPAT', }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def _x(name, default=NO_DEFAULT): - return self._search_regex( - r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name, - webpage, name, default=default, group='value') - - video_id = _x('mpx-guid') - title = _x('episode-title') - mpx_account_id = _x('mpx-account-id', '2304992029') - - query = { - 'mbr': 'true', - } - if _x('is-full-episode', None) == '1': - query['manifest'] = 'm3u' - - if _x('is-entitlement', None) == '1': - adobe_pass = {} - drupal_settings = self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', fatal=False) - if drupal_settings: - drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) - if drupal_settings: - adobe_pass = drupal_settings.get('adobePass', {}) - resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'usa'), - title, video_id, _x('episode-rating', 'TV-14')) - query['auth'] = 
self._extract_mvpd_auth( - url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) - - info = self._search_json_ld(webpage, video_id, default={}) - info.update({ - '_type': 'url_transparent', - 'url': smuggle_url(update_url_query( - 'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id), - query), {'force_smil_url': True}), - 'id': video_id, - 'title': title, - 'series': _x('show-title', None), - 'episode': title, - 'ie_key': 'ThePlatform', - }) - return info + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] From 059fa9aa8119154d4b92ff31f7f65d68675b83be Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 16:35:08 +0100 Subject: [PATCH 14/87] [vlive] fix extraction --- youtube_dl/extractor/vlive.py | 333 +++++++++++----------------------- 1 file changed, 105 insertions(+), 228 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index f79531e6f..df1dc78dd 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -1,25 +1,30 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import time import itertools +import json -from .common import InfoExtractor from .naver import NaverBaseIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, + int_or_none, merge_dicts, - remove_start, try_get, urlencode_postdata, ) -class VLiveIE(NaverBaseIE): +class VLiveBaseIE(NaverBaseIE): + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + +class VLiveIE(VLiveBaseIE): IE_NAME = 'vlive' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)' _NETRC_MACHINE = 'vlive' _TESTS = [{ 'url': 'http://www.vlive.tv/video/1326', @@ -27,7 +32,7 @@ class VLiveIE(NaverBaseIE): 'info_dict': { 'id': '1326', 'ext': 'mp4', - 'title': "[V LIVE] Girl's 
Day's Broadcast", + 'title': "Girl's Day's Broadcast", 'creator': "Girl's Day", 'view_count': int, 'uploader_id': 'muploader_a', @@ -37,7 +42,7 @@ class VLiveIE(NaverBaseIE): 'info_dict': { 'id': '16937', 'ext': 'mp4', - 'title': '[V LIVE] 첸백시 걍방', + 'title': '첸백시 걍방', 'creator': 'EXO', 'view_count': int, 'subtitles': 'mincount:12', @@ -58,12 +63,11 @@ class VLiveIE(NaverBaseIE): 'subtitles': 'mincount:10', }, 'skip': 'This video is only available for CH+ subscribers', + }, { + 'url': 'https://www.vlive.tv/embed/1326', + 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) - def _real_initialize(self): self._login() @@ -95,173 +99,122 @@ class VLiveIE(NaverBaseIE): if not is_logged_in(): raise ExtractorError('Unable to log in', expected=True) + def _call_api(self, path_template, video_id, fields=None): + query = {'appId': self._APP_ID} + if fields: + query['fields'] = fields + return self._download_json( + 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, + 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], + headers={'Referer': 'https://www.vlive.tv/'}, query=query) + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://www.vlive.tv/video/%s' % video_id, video_id) + try: + post = self._call_api( + 'post/v1.0/officialVideoPost-%s', video_id, + 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_login_required(json.loads(e.cause.read().decode())['message']) + raise - VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' - VIDEO_PARAMS_FIELD = 'video params' + video = post['officialVideo'] - params = self._parse_json(self._search_regex( - VIDEO_PARAMS_RE, 
webpage, VIDEO_PARAMS_FIELD, default=''), video_id, - transform_source=lambda s: '[' + s + ']', fatal=False) + def get_common_fields(): + channel = post.get('channel') or {} + return { + 'title': video.get('title'), + 'creator': post.get('author', {}).get('nickname'), + 'channel': channel.get('channelName'), + 'channel_id': channel.get('channelCode'), + 'duration': int_or_none(video.get('playTime')), + 'view_count': int_or_none(video.get('playCount')), + 'like_count': int_or_none(video.get('likeCount')), + 'comment_count': int_or_none(video.get('commentCount')), + } - if not params or len(params) < 7: - params = self._search_regex( - VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD) - params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)] - - status, long_video_id, key = params[2], params[5], params[6] - status = remove_start(status, 'PRODUCT_') - - if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'): - return self._live(video_id, webpage) - elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'): - return self._replay(video_id, webpage, long_video_id, key) - - if status == 'LIVE_END': - raise ExtractorError('Uploading for replay. 
Please wait...', - expected=True) - elif status == 'COMING_SOON': - raise ExtractorError('Coming soon!', expected=True) - elif status == 'CANCELED': - raise ExtractorError('We are sorry, ' - 'but the live broadcast has been canceled.', - expected=True) - elif status == 'ONLY_APP': - raise ExtractorError('Unsupported video type', expected=True) - else: - raise ExtractorError('Unknown status %s' % status) - - def _get_common_fields(self, webpage): - title = self._og_search_title(webpage) - creator = self._html_search_regex( - r'<div[^>]+class="info_area"[^>]*>\s*(?:<em[^>]*>.*?</em\s*>\s*)?<a\s+[^>]*>([^<]+)', - webpage, 'creator', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - return { - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - } - - def _live(self, video_id, webpage): - init_page = self._download_init_page(video_id) - - live_params = self._search_regex( - r'"liveStreamInfo"\s*:\s*(".*"),', - init_page, 'live stream info') - live_params = self._parse_json(live_params, video_id) - live_params = self._parse_json(live_params, video_id) - - formats = [] - for vid in live_params.get('resolutions', []): - formats.extend(self._extract_m3u8_formats( - vid['cdnUrl'], video_id, 'mp4', - m3u8_id=vid.get('name'), - fatal=False, live=True)) - self._sort_formats(formats) - - info = self._get_common_fields(webpage) - info.update({ - 'title': self._live_title(info['title']), - 'id': video_id, - 'formats': formats, - 'is_live': True, - }) - return info - - def _replay(self, video_id, webpage, long_video_id, key): - if '' in (long_video_id, key): - init_page = self._download_init_page(video_id) - video_info = self._parse_json(self._search_regex( - (r'(?s)oVideoStatus\s*=\s*({.+?})\s*</script', - r'(?s)oVideoStatus\s*=\s*({.+})'), init_page, 'video info'), - video_id) - if video_info.get('status') == 'NEED_CHANNEL_PLUS': - self.raise_login_required( - 'This video is only available for CH+ subscribers') - long_video_id, key = video_info['vid'], 
video_info['inkey'] - - return merge_dicts( - self._get_common_fields(webpage), - self._extract_video_info(video_id, long_video_id, key)) - - def _download_init_page(self, video_id): - return self._download_webpage( - 'https://www.vlive.tv/video/init/view', - video_id, note='Downloading live webpage', - data=urlencode_postdata({'videoSeq': video_id}), - headers={ - 'Referer': 'https://www.vlive.tv/video/%s' % video_id, - 'Content-Type': 'application/x-www-form-urlencoded' - }) + video_type = video.get('type') + if video_type == 'VOD': + inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey'] + vod_id = video['vodId'] + return merge_dicts( + get_common_fields(), + self._extract_video_info(video_id, vod_id, inkey)) + elif video_type == 'LIVE': + status = video.get('status') + if status == 'ON_AIR': + stream_url = self._call_api( + 'old/v3/live/%s/playInfo', + video_id)['result']['adaptiveStreamUrl'] + formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4') + info = get_common_fields() + info.update({ + 'title': self._live_title(video['title']), + 'id': video_id, + 'formats': formats, + 'is_live': True, + }) + return info + elif status == 'ENDED': + raise ExtractorError( + 'Uploading for replay. 
Please wait...', expected=True) + elif status == 'RESERVED': + raise ExtractorError('Coming soon!', expected=True) + elif video.get('exposeStatus') == 'CANCEL': + raise ExtractorError( + 'We are sorry, but the live broadcast has been canceled.', + expected=True) + else: + raise ExtractorError('Unknown status ' + status) -class VLiveChannelIE(InfoExtractor): +class VLiveChannelIE(VLiveBaseIE): IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)' - _TEST = { + _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)' + _TESTS = [{ 'url': 'http://channels.vlive.tv/FCD4B', 'info_dict': { 'id': 'FCD4B', 'title': 'MAMAMOO', }, 'playlist_mincount': 110 - } - _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + }, { + 'url': 'https://www.vlive.tv/channel/FCD4B', + 'only_matching': True, + }] + + def _call_api(self, path, channel_key_suffix, channel_value, note, query): + q = { + 'app_id': self._APP_ID, + 'channel' + channel_key_suffix: channel_value, + } + q.update(query) + return self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, + channel_value, note='Downloading ' + note, query=q)['result'] def _real_extract(self, url): channel_code = self._match_id(url) - webpage = self._download_webpage( - 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + channel_seq = self._call_api( + 'decodeChannelCode', 'Code', channel_code, + 'decode channel code', {})['channelSeq'] - app_id = None - - app_js_url = self._search_regex( - r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1', - webpage, 'app js', default=None, group='url') - - if app_js_url: - app_js = self._download_webpage( - app_js_url, channel_code, 'Downloading app JS', fatal=False) - if app_js: - app_id = self._search_regex( - r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', - app_js, 'app id', default=None) - - app_id = app_id or self._APP_ID - - channel_info = self._download_json( - 
'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', - channel_code, note='Downloading decode channel code', - query={ - 'app_id': app_id, - 'channelCode': channel_code, - '_': int(time.time()) - }) - - channel_seq = channel_info['result']['channelSeq'] channel_name = None entries = [] for page_num in itertools.count(1): - video_list = self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', - channel_code, note='Downloading channel list page #%d' % page_num, - query={ - 'app_id': app_id, - 'channelSeq': channel_seq, + video_list = self._call_api( + 'getChannelVideoList', 'Seq', channel_seq, + 'channel list page #%d' % page_num, { # Large values of maxNumOfRows (~300 or above) may cause # empty responses (see [1]), e.g. this happens for [2] that # has more than 300 videos. # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 # 2. http://channels.vlive.tv/EDBF. 'maxNumOfRows': 100, - '_': int(time.time()), 'pageNo': page_num } ) @@ -269,11 +222,11 @@ class VLiveChannelIE(InfoExtractor): if not channel_name: channel_name = try_get( video_list, - lambda x: x['result']['channelInfo']['channelName'], + lambda x: x['channelInfo']['channelName'], compat_str) videos = try_get( - video_list, lambda x: x['result']['videoList'], list) + video_list, lambda x: x['videoList'], list) if not videos: break @@ -289,79 +242,3 @@ class VLiveChannelIE(InfoExtractor): return self.playlist_result( entries, channel_code, channel_name) - - -class VLivePlaylistIE(InfoExtractor): - IE_NAME = 'vlive:playlist' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)' - _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' - _TESTS = [{ - # regular working playlist - 'url': 'https://www.vlive.tv/video/117956/playlist/117963', - 'info_dict': { - 'id': '117963', - 'title': '아이돌룸(IDOL ROOM) 41회 - (여자)아이들' - }, - 'playlist_mincount': 10 - }, { - # playlist with no playlistVideoSeqs - 'url': 
'http://www.vlive.tv/video/22867/playlist/22912', - 'info_dict': { - 'id': '22867', - 'ext': 'mp4', - 'title': '[V LIVE] Valentine Day Message from MINA', - 'creator': 'TWICE', - 'view_count': int - }, - 'params': { - 'skip_download': True, - } - }] - - def _build_video_result(self, video_id, message): - self.to_screen(message) - return self.url_result( - self._VIDEO_URL_TEMPLATE % video_id, - ie=VLiveIE.ie_key(), video_id=video_id) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, playlist_id = mobj.group('video_id', 'id') - - if self._downloader.params.get('noplaylist'): - return self._build_video_result( - video_id, - 'Downloading just video %s because of --no-playlist' - % video_id) - - self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download video' - % playlist_id) - - webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s/playlist/%s' - % (video_id, playlist_id), playlist_id) - - raw_item_ids = self._search_regex( - r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage, - 'playlist video seqs', default=None, fatal=False) - - if not raw_item_ids: - return self._build_video_result( - video_id, - 'Downloading just video %s because no playlist was found' - % video_id) - - item_ids = self._parse_json(raw_item_ids, playlist_id) - - entries = [ - self.url_result( - self._VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(), - video_id=compat_str(item_id)) - for item_id in item_ids] - - playlist_name = self._html_search_regex( - r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)', - webpage, 'playlist title', fatal=False) - - return self.playlist_result(entries, playlist_id, playlist_name) From f22fa82d7ffed28e0a1dd7b8370a699ee5d894d6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 16:36:56 +0100 Subject: [PATCH 15/87] [extractors] Remove VLivePlaylistIE import --- youtube_dl/extractor/extractors.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9d7fecfe8..302ce6be4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1357,7 +1357,6 @@ from .vk import ( from .vlive import ( VLiveIE, VLiveChannelIE, - VLivePlaylistIE ) from .vodlocker import VodlockerIE from .vodpl import VODPlIE From fe13087cd152c7eb98ea418bad0419a15c767c08 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 17:17:52 +0100 Subject: [PATCH 16/87] [rai] fix RaiPlay extraction --- youtube_dl/extractor/rai.py | 61 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 207a6c247..bee2d53f5 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -16,7 +17,6 @@ from ..utils import ( int_or_none, parse_duration, strip_or_none, - try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -141,6 +141,7 @@ class RaiPlayIE(RaiBaseIE): 'series': 'La Casa Bianca', 'season': '2016', }, + 'skip': 'This content is not available', }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -148,14 +149,12 @@ class RaiPlayIE(RaiBaseIE): 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'S2013/14 - Puntata del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 5', - 'creator': 'Rai 5', + 'uploader': 'Rai Gulp', 'duration': 6160, 'series': 'Report', - 'season_number': 5, 'season': '2013/14', }, 'params': { @@ -167,48 +166,51 @@ class 
RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, video_id = mobj.group('url', 'id') + url, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s?json' % url, video_id, 'Downloading video JSON') + url.replace('.html', '.json'), video_id, 'Downloading video JSON') title = media['name'] video = media['video'] - relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + relinker_info = self._extract_relinker_info(video['content_url'], video_id) self._sort_formats(relinker_info['formats']) thumbnails = [] - if 'images' in media: - for _, value in media.get('images').items(): - if value: - thumbnails.append({ - 'url': value.replace('[RESOLUTION]', '600x400') - }) + for _, value in media.get('images', {}).items(): + if value: + thumbnails.append({ + 'url': urljoin(url, value), + }) - timestamp = unified_timestamp(try_get( - media, lambda x: x['availabilities'][0]['start'], compat_str)) + date_published = media.get('date_published') + time_published = media.get('time_published') + if date_published and time_published: + date_published += ' ' + time_published subtitles = self._extract_subtitles(url, video.get('subtitles')) + program_info = media.get('program_info') or {} + season = media.get('season') + info = { 'id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, - 'alt_title': media.get('subtitle'), + 'alt_title': strip_or_none(media.get('subtitle')), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), + 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date_published), 'thumbnails': thumbnails, - 'series': try_get( - media, lambda x: x['isPartOf']['name'], compat_str), - 'season_number': int_or_none(try_get( - media, lambda x: 
x['isPartOf']['numeroStagioni'])), - 'season': media.get('stagione') or None, + 'series': program_info.get('name'), + 'season_number': int_or_none(season), + 'season': season if (season and not season.isdigit()) else None, + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), 'subtitles': subtitles, } @@ -300,7 +302,8 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, 'upload_date': '20140612', - } + }, + 'skip': 'This content is available only in Italy', }, { # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -316,7 +319,7 @@ class RaiIE(RaiBaseIE): }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '11959b4e44fa74de47011b5799490adf', + 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', @@ -338,6 +341,7 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20141221', }, + 'skip': 'This content is not available', }, { # initEdizione('ContentItem-...' 
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -360,6 +364,7 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'This content is available only in Italy', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', From 3f1748b9445e9d9367d29221c4b7bf9b88895e4e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 17:26:09 +0100 Subject: [PATCH 17/87] [bandcamp] fix extraction --- youtube_dl/extractor/bandcamp.py | 149 ++++++++++++------------------- 1 file changed, 58 insertions(+), 91 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f14b407dc..731c7c25c 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import random @@ -5,10 +6,7 @@ import re import time from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -17,30 +15,32 @@ from ..utils import ( parse_filesize, str_or_none, try_get, - unescapeHTML, update_url_query, unified_strdate, unified_timestamp, url_or_none, + urljoin, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)' + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭", 'duration': 9.8485, + 'uploader': 'youtube-dl "\'/\\ä↭', + 
'upload_date': '20121129', + 'timestamp': 1354224127, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '853e35bf34aa1d6fe2615ae612564b36', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -79,11 +79,16 @@ class BandcampIE(InfoExtractor): }, }] + def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): + return self._parse_json(self._html_search_regex( + r'data-%s=(["\'])({.+?})\1' % attr, webpage, + attr + ' data', group=2), video_id, fatal=fatal) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') + title = self._match_id(url) webpage = self._download_webpage(url, title) - thumbnail = self._html_search_meta('og:image', webpage, default=None) + tralbum = self._extract_data_attr(webpage, title) + thumbnail = self._og_search_thumbnail(webpage) track_id = None track = None @@ -91,10 +96,7 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -111,37 +113,25 @@ class BandcampIE(InfoExtractor): 'abr': int_or_none(abr_str), }) track = track_info.get('title') - track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_id = str_or_none( + track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) - def extract(key): - return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, - webpage, key, default=None, group='value') - - artist = extract('artist') - album = extract('album_title') + embed = self._extract_data_attr(webpage, 
title, 'embed', False) + current = tralbum.get('current') or {} + artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') timestamp = unified_timestamp( - extract('publish_date') or extract('album_publish_date')) - release_date = unified_strdate(extract('album_release_date')) + current.get('publish_date') or tralbum.get('album_publish_date')) - download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'download link', default=None, group='url') + download_link = tralbum.get('freeDownloadPage') if download_link: - track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', - webpage, 'track id') + track_id = compat_str(tralbum['id']) download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, - 'blob', group='blob'), - track_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(download_webpage, track_id, 'blob') info = try_get( blob, (lambda x: x['digital_items'][0], @@ -207,20 +197,20 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': release_date, + 'release_date': unified_strdate(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, 'track_id': track_id, 'artist': artist, - 'album': album, + 'album': embed.get('album_title'), 'formats': formats, } -class BandcampAlbumIE(InfoExtractor): +class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?' 
_TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -230,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '1353101989', 'ext': 'mp3', - 'title': 'Intro', + 'title': 'Blazo - Intro', + 'timestamp': 1311756226, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, { @@ -238,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '38097443', 'ext': 'mp3', - 'title': 'Kero One - Keep It Alive (Blazo remix)', + 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', + 'timestamp': 1311757238, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, ], @@ -294,41 +290,31 @@ class BandcampAlbumIE(InfoExtractor): else super(BandcampAlbumIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('subdomain') - album_id = mobj.group('album_id') + uploader_id, album_id = re.match(self._VALID_URL, url).groups() playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - track_elements = re.findall( - r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) - if not track_elements: + tralbum = self._extract_data_attr(webpage, playlist_id) + track_info = tralbum.get('trackinfo') + if not track_info: raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ self.url_result( - compat_urlparse.urljoin(url, t_path), - ie=BandcampIE.ie_key(), - video_title=self._search_regex( - r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', - elem_content, 'track title', fatal=False)) - for elem_content, t_path in track_elements - if self._html_search_meta('duration', elem_content, default=None)] + urljoin(url, t['title_link']), BandcampIE.ie_key(), + str_or_none(t.get('track_id') or t.get('id')), t.get('title')) + for t in track_info + if t.get('duration')] - title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', 
- webpage, 'title', fatal=False) - if title: - title = title.replace(r'\"', '"') return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': title, + 'title': try_get(tralbum, lambda x: x['current']['title'], compat_str), 'entries': entries, } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE): IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ @@ -343,29 +329,23 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': '20170404', 'series': 'Bandcamp Weekly', 'episode': 'Magic Moments', - 'episode_number': 208, 'episode_id': '224', - } + }, + 'params': { + 'format': 'opus-lo', + }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', 'only_matching': True }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(webpage, show_id, 'blob') - show = blob['bcw_show'] - - # This is desired because any invalid show id redirects to `bandcamp.com` - # which happens to expose the latest Bandcamp Weekly episode. 
- show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + show = blob['bcw_data'][show_id] formats = [] for format_id, format_url in show['audio_stream'].items(): @@ -390,20 +370,8 @@ class BandcampWeeklyIE(InfoExtractor): if subtitle: title += ' - %s' % subtitle - episode_number = None - seq = blob.get('bcw_seq') - - if seq and isinstance(seq, list): - try: - episode_number = next( - int_or_none(e.get('episode_number')) - for e in seq - if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) - except StopIteration: - pass - return { - 'id': video_id, + 'id': show_id, 'title': title, 'description': show.get('desc') or show.get('short_desc'), 'duration': float_or_none(show.get('audio_duration')), @@ -411,7 +379,6 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': unified_strdate(show.get('published_date')), 'series': 'Bandcamp Weekly', 'episode': show.get('subtitle'), - 'episode_number': episode_number, - 'episode_id': compat_str(video_id), + 'episode_id': show_id, 'formats': formats } From 9448a203126105d6462299bddbe3a6a32bc017fd Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 18:57:33 +0100 Subject: [PATCH 18/87] [condenast] fix extraction and extract subtitles --- youtube_dl/extractor/condenast.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index ed278fefc..d5e77af32 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -16,6 +16,8 @@ from ..utils import ( mimetype2ext, orderedSet, parse_iso8601, + strip_or_none, + try_get, ) @@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor): 'uploader': 'gq', 'upload_date': '20170321', 'timestamp': 1490126427, + 'description': 'How much grimmer would things be if these people were competent?', }, }, { # JS embed @@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor): 'title': '3D printed TSA Travel Sentry 
keys really do open TSA locks', 'uploader': 'arstechnica', 'upload_date': '20150916', - 'timestamp': 1442434955, + 'timestamp': 1442434920, } }, { 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', @@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + for t, caption in video_info.get('captions', {}).items(): + caption_url = caption.get('src') + if not (t in ('vtt', 'srt', 'tml') and caption_url): + continue + subtitles.setdefault('en', []).append({'url': caption_url}) + return { 'id': video_id, 'formats': formats, @@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor): 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), 'categories': video_info.get('categories'), + 'subtitles': subtitles, } def _real_extract(self, url): @@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage, display_id) - info = self._search_json_ld( - webpage, display_id, fatal=False) + video = try_get(self._parse_json(self._search_regex( + r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', '{}'), display_id), + lambda x: x['transformed']['video']) + if video: + params = {'videoId': video['id']} + info = {'description': strip_or_none(video.get('description'))} + else: + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, fatal=False) info.update(self._extract_video(params)) return info From ec2a2ab44132e8000cf0a0a81b793c3ea5fc1903 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 19:04:55 +0100 Subject: [PATCH 19/87] [lbry] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/lbry.py | 85 ++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 
youtube_dl/extractor/lbry.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 302ce6be4..b2baf8057 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -529,6 +529,7 @@ from .laola1tv import ( EHFTVIE, ITTFIE, ) +from .lbry import LBRYIE from .lci import LCIIE from .lcp import ( LcpPlayIE, diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py new file mode 100644 index 000000000..587deac90 --- /dev/null +++ b/youtube_dl/extractor/lbry.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + try_get, +) + + +class LBRYIE(InfoExtractor): + IE_NAME = 'lbry.tv' + _VALID_URL = r'https?://(?:www\.)?lbry\.tv/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])' + _TESTS = [{ + # Video + 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', + 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', + 'info_dict': { + 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', + 'ext': 'mp4', + 'title': 'First day in LBRY? 
Start HERE!', + 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', + 'timestamp': 1595694354, + 'upload_date': '20200725', + } + }, { + # Audio + 'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e', + 'md5': 'c94017d3eba9b49ce085a8fad6b98d00', + 'info_dict': { + 'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'ext': 'mp3', + 'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding', + 'description': 'md5:661ac4f1db09f31728931d7b88807a61', + 'timestamp': 1591312601, + 'upload_date': '20200604', + } + }] + + def _call_api_proxy(self, method, display_id, params): + return self._download_json( + 'https://api.lbry.tv/api/v1/proxy', display_id, + headers={'Content-Type': 'application/json-rpc'}, + data=json.dumps({ + 'method': method, + 'params': params, + }).encode())['result'] + + def _real_extract(self, url): + display_id = self._match_id(url).replace(':', '#') + uri = 'lbry://' + display_id + result = self._call_api_proxy( + 'resolve', display_id, {'urls': [uri]})[uri] + result_value = result['value'] + if result_value.get('stream_type') not in ('video', 'audio'): + raise ExtractorError('Unsupported URL', expected=True) + streaming_url = self._call_api_proxy( + 'get', display_id, {'uri': uri})['streaming_url'] + source = result_value.get('source') or {} + media = result_value.get('video') or result_value.get('audio') or {} + signing_channel = result_value.get('signing_channel') or {} + + return { + 'id': result['claim_id'], + 'title': result_value['title'], + 'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str), + 'description': result_value.get('description'), + 'license': result_value.get('license'), + 'timestamp': int_or_none(result.get('timestamp')), + 'tags': result_value.get('tags'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + 'duration': int_or_none(media.get('duration')), + 'channel': signing_channel.get('name'), + 
'channel_id': signing_channel.get('claim_id'), + 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), + 'filesize': int_or_none(source.get('size')), + 'url': streaming_url, + } From 6d3bdcf2177ce75f8f95731186a4794412b9776d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 19:17:10 +0100 Subject: [PATCH 20/87] [lrt] fix extraction --- youtube_dl/extractor/lrt.py | 91 +++++++++++++++---------------------- 1 file changed, 36 insertions(+), 55 deletions(-) diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index f5c997ef4..a89434adb 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -5,28 +5,26 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - remove_end, + clean_html, + merge_dicts, ) class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' - _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))' _TESTS = [{ # m3u8 download - 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', - 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', + 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', + 'md5': '85cb2bb530f31d91a9c65b479516ade4', 'info_dict': { - 'id': '54391', + 'id': '2000127261', 'ext': 'mp4', - 'title': 'Septynios Kauno dienos', - 'description': 'md5:24d84534c7dc76581e59f5689462411a', - 'duration': 1783, - 'view_count': int, - 'like_count': int, + 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', + 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', + 'duration': 3035, + 'timestamp': 1604079000, + 'upload_date': '20201030', }, }, { # direct mp3 download @@ -43,52 +41,35 @@ class LRTIE(InfoExtractor): }, }] + def _extract_js_var(self, webpage, var_name, default): + 
return self._search_regex( + r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, + webpage, var_name.replace('_', ' '), default, group=2) + def _real_extract(self, url): - video_id = self._match_id(url) + path, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - LRT') + media_url = self._extract_js_var(webpage, 'main_url', path) + media = self._download_json(self._extract_js_var( + webpage, 'media_info_url', + 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), + video_id, query={'url': media_url}) + jw_data = self._parse_jwplayer_data( + media['playlist_item'], video_id, base_url=url) - formats = [] - for _, file_url in re.findall( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): - ext = determine_ext(file_url) - if ext not in ('m3u8', 'mp3'): + json_ld_data = self._search_json_ld(webpage, video_id) + + tags = [] + for tag in media.get('tags', []): + tag_name = tag.get('name') + if not tag_name: continue - # mp3 served as m3u8 produces stuttered media file - if ext == 'm3u8' and '.mp3' in file_url: - continue - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - fatal=False)) - elif ext == 'mp3': - formats.append({ - 'url': file_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) + tags.append(tag_name) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - duration = parse_duration(self._search_regex( - r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', - webpage, 'duration', default=None, group='duration')) - - view_count = int_or_none(self._html_search_regex( - r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', - webpage, 'view count', fatal=False, group='count')) - like_count = int_or_none(self._search_regex( - r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', - 
webpage, 'like count', fatal=False, group='count')) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, + clean_info = { + 'description': clean_html(media.get('content')), + 'tags': tags, } + + return merge_dicts(clean_info, jw_data, json_ld_data) From fe07e788bf7718d429d6fc7e4bcb0c761ffd2cfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 01:30:43 +0700 Subject: [PATCH 21/87] [utils] Skip ! prefixed code in js_to_json --- test/test_utils.py | 22 ++++++++++++++++++++++ youtube_dl/utils.py | 5 +++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index c2d1e4fb1..925a21d34 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + # Just drop ! 
prefix for now though this results in a wrong value + on = js_to_json('''{ + a: !0, + b: !1, + c: !!0, + d: !!42.42, + e: !!![], + f: !"abc", + g: !"", + !42: 42 + }''') + self.assertEqual(json.loads(on), { + 'a': 0, + 'b': 1, + 'c': 0, + 'd': 42.42, + 'e': [], + 'f': "abc", + 'g': "", + '42': 42 + }) + on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 737e2810e..321f903ab 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4078,7 +4078,7 @@ def js_to_json(code): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v.startswith('//') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': return "" if v[0] in ("'", '"'): @@ -4103,7 +4103,8 @@ def js_to_json(code): {comment}|,(?={skip}[\]}}])| (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| - [0-9]+(?={skip}:) + [0-9]+(?={skip}:)| + !+ '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) From 2e7fa18bb96dac31bda754f9f7f0ab21a6513166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 01:32:48 +0700 Subject: [PATCH 22/87] [xtube] Fix extraction (closes #26996) --- youtube_dl/extractor/xtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 01b253dcb..18969058f 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -90,7 +90,7 @@ class XTubeIE(InfoExtractor): title, thumbnail, duration = [None] * 3 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') From 
a80b23c373202b35a2b00ba2455d0e2fcf8d366f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 02:57:26 +0700 Subject: [PATCH 23/87] [servus] Fix extraction (closes #26872, closes #26967, closes #26983, closes #27000) --- youtube_dl/extractor/servus.py | 106 ++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index 9401bf2cf..206bc1801 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -1,9 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + unified_timestamp, + urlencode_postdata, + url_or_none, +) class ServusIE(InfoExtractor): @@ -19,13 +25,22 @@ class ServusIE(InfoExtractor): _TESTS = [{ # new URL schema 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', - 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'md5': '60474d4c21f3eb148838f215c37f02b9', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', 'title': 'Die Grünen aus Sicht des Volkes', + 'alt_title': 'Talk im Hangar-7 Voxpops Gruene', 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 62.442, + 'timestamp': 1605193976, + 'upload_date': '20201112', + 'series': 'Talk im Hangar-7', + 'season': 'Season 9', + 'season_number': 9, + 'episode': 'Episode 31 - September 14', + 'episode_number': 31, } }, { # old URL schema @@ -44,26 +59,87 @@ class ServusIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url).upper() - webpage = self._download_webpage(url, video_id) - title = self._search_regex( - (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', - r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), - webpage, 'title', default=None, - group='title') or 
self._og_search_title(webpage) - title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + token = self._download_json( + 'https://auth.redbullmediahouse.com/token', video_id, + 'Downloading token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + }), headers={ + 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', + }) + access_token = token['access_token'] + token_type = token.get('token_type', 'Bearer') - formats = self._extract_m3u8_formats( - 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + video = self._download_json( + 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, + video_id, 'Downloading video JSON', headers={ + 'Authorization': '%s %s' % (token_type, access_token), + }) + + formats = [] + thumbnail = None + for resource in video['resources']: + if not isinstance(resource, dict): + continue + format_url = url_or_none(resource.get('url')) + if not format_url: + continue + extension = resource.get('extension') + type_ = resource.get('type') + if extension == 'jpg' or type_ == 'reference_keyframe': + thumbnail = format_url + continue + ext = determine_ext(format_url) + if type_ == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif type_ == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif extension == 'mp4' or ext == 'mp4': + formats.append({ + 'url': format_url, + 'format_id': type_, + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + }) self._sort_formats(formats) + attrs = {} + for attribute in video['attributes']: + if not isinstance(attribute, 
dict): + continue + key = attribute.get('fieldKey') + value = attribute.get('fieldValue') + if not key or not value: + continue + attrs[key] = value + + title = attrs.get('title_stv') or video_id + alt_title = attrs.get('title') + description = attrs.get('long_description') or attrs.get('short_description') + series = attrs.get('label') + season = attrs.get('season') + episode = attrs.get('chapter') + duration = float_or_none(attrs.get('duration'), scale=1000) + season_number = int_or_none(self._search_regex( + r'Season (\d+)', season or '', 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', episode or '', 'episode number', default=None)) + return { 'id': video_id, 'title': title, + 'alt_title': alt_title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': unified_timestamp(video.get('lastPublished')), + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'formats': formats, } From f4093b34f6ae8f8f3603b20270232950d52933ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 03:02:07 +0700 Subject: [PATCH 24/87] [servus] Add support for pm-wissen.com (closes #25869) --- youtube_dl/extractor/servus.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index 206bc1801..1610ddc2c 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -18,7 +18,7 @@ class ServusIE(InfoExtractor): (?:www\.)? 
(?: servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - servustv\.com/videos + (?:servustv|pm-wissen)\.com/videos ) /(?P<id>[aA]{2}-\w+|\d+-\d+) ''' @@ -55,6 +55,9 @@ class ServusIE(InfoExtractor): }, { 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', 'only_matching': True, + }, { + 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/', + 'only_matching': True, }] def _real_extract(self, url): From 11f3471c4be16d0f848c72a4b4915f5f81d4f337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 03:24:23 +0700 Subject: [PATCH 25/87] [ndr:embed:base] Extract subtitles (closes #25447, closes #26106) --- youtube_dl/extractor/ndr.py | 38 +++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 2447c812e..ddd828d92 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -81,6 +81,29 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'uploader': 'ndrtv', + 'upload_date': '20201113', + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': r're:^https://www\.ndr\.de.+', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', 'only_matching': True, @@ -239,6 +262,20 @@ class NDREmbedBaseIE(InfoExtractor): 'preference': quality_key(thumbnail.get('quality')), }) + subtitles = {} + tracks = config.get('tracks') + if 
tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_url = urljoin(url, track.get('src')) + if not track_url: + continue + subtitles.setdefault(track.get('srclang') or 'de', []).append({ + 'url': track_url, + 'ext': 'ttml', + }) + return { 'id': video_id, 'title': title, @@ -248,6 +285,7 @@ class NDREmbedBaseIE(InfoExtractor): 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, + 'subtitles': subtitles, } From 91dcde8a381d1c442e5b56fa1d3652fdd3f4496d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 16 Nov 2020 21:27:51 +0100 Subject: [PATCH 26/87] [lrt] fix extraction with empty tags(closes #20264) --- youtube_dl/extractor/lrt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index a89434adb..89d549858 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -61,7 +61,7 @@ class LRTIE(InfoExtractor): json_ld_data = self._search_json_ld(webpage, video_id) tags = [] - for tag in media.get('tags', []): + for tag in (media.get('tags') or []): tag_name = tag.get('name') if not tag_name: continue From 6699b6ce41ca04d1d8782b943dfacfa41e7102eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 03:40:02 +0700 Subject: [PATCH 27/87] [youtube:tab] Fix extraction with cookies provided (closes #27005) --- youtube_dl/extractor/youtube.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 248682a41..22af03832 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2723,7 +2723,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'itct': ctp, } - def _entries(self, tab): + def _entries(self, tab, identity_token): continuation = None slr_contents = tab['sectionListRenderer']['contents'] for 
slr_content in slr_contents: @@ -2768,16 +2768,20 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not continuation: continuation = self._extract_continuation(is_renderer) + headers = { + 'x-youtube-client-name': '1', + 'x-youtube-client-version': '2.20201112.04.01', + } + if identity_token: + headers['x-youtube-identity-token'] = identity_token + for page_num in itertools.count(1): if not continuation: break browse = self._download_json( 'https://www.youtube.com/browse_ajax', None, 'Downloading page %d' % page_num, - headers={ - 'x-youtube-client-name': '1', - 'x-youtube-client-version': '2.20201030.01.00', - }, query=continuation, fatal=False) + headers=headers, query=continuation, fatal=False) if not browse: break response = try_get(browse, lambda x: x[1]['response'], dict) @@ -2848,8 +2852,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title = channel_title or channel_id if tab_title: title += ' - %s' % tab_title + identity_token = self._search_regex( + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + 'identity token', default=None) return self.playlist_result( - self._entries(selected_tab['content']), + self._entries(selected_tab['content'], identity_token), playlist_id=channel_external_id or channel_id, playlist_title=title) From e3cad6bd99f32ef5f146d803b60be182d34a83b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 03:50:12 +0700 Subject: [PATCH 28/87] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 1ef7ea7b6..65aa524a1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version <unreleased> + +Core +* [utils] Skip ! 
prefixed code in js_to_json + +Extractors +* [youtube:tab] Fix extraction with cookies provided (#27005) +* [lrt] Fix extraction with empty tags (#20264) ++ [ndr:embed:base] Extract subtitles (#25447, #26106) ++ [servus] Add support for pm-wissen.com (#25869) +* [servus] Fix extraction (#26872, #26967, #26983, #27000) +* [xtube] Fix extraction (#26996) +* [lrt] Fix extraction ++ [lbry] Add support for lbry.tv ++ [condenast] Extract subtitles +* [condenast] Fix extraction +* [bandcamp] Fix extraction (#26681, #26684) +* [rai] Fix RaiPlay extraction (#26064, #26096) +* [vlive] Fix extraction +* [usanetwork] Fix extraction +* [nbc] Fix NBCNews/Today/MSNBC extraction +* [cnbc] Fix extraction + + version 2020.11.12 Extractors From b92e95aa0159f114f1066fbbefed128d8c850c60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 03:59:54 +0700 Subject: [PATCH 29/87] release 2020.11.17 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index a10c9fd83..80baffa2a 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
+- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.12 + [debug] youtube-dl version 2020.11.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 9cc120d3e..ee4215296 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this 
check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 29bd5f5ac..4c3834fa5 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
- Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index cc33a993f..3ad3e7409 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.12 + [debug] youtube-dl version 2020.11.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index cd577ecef..aabbfe83c 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.12. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.12** +- [ ] I've verified that I'm running youtube-dl version **2020.11.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 65aa524a1..254d0ef1b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.11.17 Core * [utils] Skip ! prefixed code in js_to_json diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0c77d017e..86b5ad726 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -417,6 +417,7 @@ - **la7.it** - **laola1tv** - **laola1tv:embed** + - **lbry.tv** - **LCI** - **Lcp** - **LcpPlay** @@ -1042,7 +1043,6 @@ - **vk:wallpost** - **vlive** - **vlive:channel** - - **vlive:playlist** - **Vodlocker** - **VODPl** - **VODPlatform** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 04cd207ab..ed18392a0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.12' +__version__ = '2020.11.17' From 7d509c613ba66492ede723188d4254bb1427f4a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 04:28:35 +0700 Subject: [PATCH 30/87] [youtube] Fix chapters extraction (closes #26005) --- youtube_dl/extractor/youtube.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 22af03832..1a395b6e1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1465,21 +1465,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_chapters_from_json(self, webpage, video_id, duration): if not 
webpage: return - player = self._parse_json( - self._search_regex( - r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage, - 'player args', default='{}'), - video_id, fatal=False) - if not player or not isinstance(player, dict): - return - watch_next_response = player.get('watch_next_response') - if not isinstance(watch_next_response, compat_str): - return - response = self._parse_json(watch_next_response, video_id, fatal=False) - if not response or not isinstance(response, dict): + data = self._extract_yt_initial_data(video_id, webpage) + if not data or not isinstance(data, dict): return chapters_list = try_get( - response, + data, lambda x: x['playerOverlays'] ['playerOverlayRenderer'] ['decoratedPlayerBarRenderer'] From f8c749f12c3b88fc97b4d2a2d9934483589b50a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 17 Nov 2020 07:01:41 +0700 Subject: [PATCH 31/87] [youtube:tab] Fix playlist title extraction (closes #27015) --- youtube_dl/extractor/youtube.py | 36 +++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1a395b6e1..9333e48e4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2825,30 +2825,36 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): raise ExtractorError('Unable to find selected tab') def _real_extract(self, url): - channel_id = self._match_id(url) + item_id = self._match_id(url) url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) - webpage = self._download_webpage(url, channel_id) - data = self._extract_yt_initial_data(channel_id, webpage) + webpage = self._download_webpage(url, item_id) + data = self._extract_yt_initial_data(item_id, webpage) tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs'] selected_tab = self._extract_selected_tab(tabs) - channel_title = try_get( - data, lambda x: 
x['metadata']['channelMetadataRenderer']['title'], - compat_str) - channel_external_id = try_get( - data, lambda x: x['metadata']['channelMetadataRenderer']['externalId'], - compat_str) - tab_title = selected_tab.get('title') - title = channel_title or channel_id - if tab_title: - title += ' - %s' % tab_title + renderer = try_get( + data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + if renderer: + channel_title = renderer.get('title') or item_id + tab_title = selected_tab.get('title') + title = channel_title or item_id + if tab_title: + title += ' - %s' % tab_title + description = renderer.get('description') + playlist_id = renderer.get('externalId') + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + if renderer: + title = renderer.get('title') + description = None + playlist_id = item_id identity_token = self._search_regex( r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None) return self.playlist_result( self._entries(selected_tab['content'], identity_token), - playlist_id=channel_external_id or channel_id, - playlist_title=title) + playlist_id=playlist_id, playlist_title=title, + playlist_description=description) class YoutubePlaylistIE(InfoExtractor): From 5b867c15a8443c79e1521053c19db3ae5f679625 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 17 Nov 2020 13:11:35 +0100 Subject: [PATCH 32/87] [urplay] fix extraction(closes #26828) --- youtube_dl/extractor/urplay.py | 77 ++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index 6030b7cb5..10b817760 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_timestamp +from ..utils import ( + dict_get, + int_or_none, + unified_timestamp, +) class 
URPlayIE(InfoExtractor): @@ -15,8 +19,8 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', - 'timestamp': 1513512768, - 'upload_date': '20171217', + 'timestamp': 1513292400, + 'upload_date': '20171214', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -25,7 +29,7 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', - 'timestamp': 1440093600, + 'timestamp': 1440086400, 'upload_date': '20150820', }, }, { @@ -35,37 +39,58 @@ class URPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._search_regex( - r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) - host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + urplayer_data = self._parse_json(self._html_search_regex( + r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['currentProduct'] + episode = urplayer_data['title'] + raw_streaming_info = urplayer_data['streamingInfo']['raw'] + host = self._download_json( + 'http://streaming-loadbalancer.ur.se/loadbalancer.json', + video_id)['redirect'] formats = [] - for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): - file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + for k, v in raw_streaming_info.items(): + if not (k in ('sd', 'hd') and isinstance(v, dict)): + continue + file_http = v.get('location') if file_http: formats.extend(self._extract_wowza_formats( - 'http://%s/%splaylist.m3u8' % 
(host, file_http), video_id, skip_protocols=['rtmp', 'rtsp'])) + 'http://%s/%splaylist.m3u8' % (host, file_http), + video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) - subtitles = {} - for subtitle in urplayer_data.get('subtitles', []): - subtitle_url = subtitle.get('file') - kind = subtitle.get('kind') - if not subtitle_url or (kind and kind != 'captions'): - continue - subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ - 'url': subtitle_url, - }) + image = urplayer_data.get('image') or {} + thumbnails = [] + for k, v in image.items(): + t = { + 'id': k, + 'url': v, + } + wh = k.split('x') + if len(wh) == 2: + t.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + thumbnails.append(t) + + series = urplayer_data.get('series') or {} + series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) return { 'id': video_id, - 'title': urplayer_data['title'], - 'description': self._og_search_description(webpage), - 'thumbnail': urplayer_data.get('image'), - 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')), - 'series': urplayer_data.get('series_title'), - 'subtitles': subtitles, + 'title': '%s : %s' % (series_title, episode) if series_title else episode, + 'description': urplayer_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), + 'series': series_title, 'formats': formats, + 'duration': int_or_none(urplayer_data.get('duration')), + 'categories': urplayer_data.get('categories'), + 'tags': urplayer_data.get('keywords'), + 'season': series.get('label'), + 'episode': episode, + 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), } From cb2b9a22a5a53dd63f26db7509f4438a19261e36 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 17 Nov 2020 14:46:02 +0100 Subject: [PATCH 33/87] 
[bandcamp] extract playlist_description(closes #22684) --- youtube_dl/extractor/bandcamp.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 731c7c25c..69e673a26 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -270,6 +270,7 @@ class BandcampAlbumIE(BandcampIE): 'title': '"Entropy" EP', 'uploader_id': 'jstrecords', 'id': 'entropy-ep', + 'description': 'md5:0ff22959c943622972596062f2f366a5', }, 'playlist_mincount': 3, }, { @@ -279,6 +280,7 @@ class BandcampAlbumIE(BandcampIE): 'id': 'we-are-the-plague', 'title': 'WE ARE THE PLAGUE', 'uploader_id': 'insulters', + 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', }, 'playlist_count': 2, }] @@ -305,11 +307,14 @@ class BandcampAlbumIE(BandcampIE): for t in track_info if t.get('duration')] + current = tralbum.get('current') or {} + return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': try_get(tralbum, lambda x: x['current']['title'], compat_str), + 'title': current.get('title'), + 'description': current.get('about'), 'entries': entries, } From aa613ef7e1efe9f799a1209659f8d9d01e3de221 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 17 Nov 2020 19:13:38 +0100 Subject: [PATCH 34/87] [malltv] fix extraction(closes #27035) --- youtube_dl/extractor/malltv.py | 60 ++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py index 6f4fd927f..fadfd9338 100644 --- a/youtube_dl/extractor/malltv.py +++ b/youtube_dl/extractor/malltv.py @@ -1,10 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import merge_dicts +from ..utils import ( + clean_html, + dict_get, + float_or_none, + int_or_none, + merge_dicts, + parse_duration, + try_get, +) class 
MallTVIE(InfoExtractor): @@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor): 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', 'ext': 'mp4', 'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', - 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35', 'duration': 216, 'timestamp': 1538870400, 'upload_date': '20181007', @@ -37,20 +43,46 @@ class MallTVIE(InfoExtractor): webpage = self._download_webpage( url, display_id, headers=self.geo_verification_headers()) - SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b' + video = self._parse_json(self._search_regex( + r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', + webpage, 'video object'), display_id) + video_source = video['VideoSource'] video_id = self._search_regex( - SOURCE_RE, webpage, 'video id', group='id') + r'/([\da-z]+)/index\b', video_source, 'video id') - media = self._parse_html5_media_entries( - url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, - m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + formats = self._extract_m3u8_formats( + video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + subtitles = {} + for s in (video.get('Subtitles') or {}): + s_url = s.get('Url') + if not s_url: + continue + subtitles.setdefault(s.get('Language') or 'cz', []).append({ + 'url': s_url, + }) + + entity_counts = video.get('EntityCounts') or {} + + def get_count(k): + v = entity_counts.get(k + 's') or {} + return int_or_none(dict_get(v, ('Count', 'StrCount'))) info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(media, info, { + return merge_dicts({ 'id': video_id, 'display_id': display_id, - 'title': self._og_search_title(webpage, default=None) or display_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': 
self._og_search_thumbnail(webpage, default=None), - }) + 'title': video.get('Title'), + 'description': clean_html(video.get('Description')), + 'thumbnail': video.get('ThumbnailUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')), + 'view_count': get_count('View'), + 'like_count': get_count('Like'), + 'dislike_count': get_count('Dislike'), + 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])), + 'comment_count': get_count('Comment'), + }, info) From 284f8306dffb3dbeeca3f99ef4c32ed4fcd571c3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 17 Nov 2020 20:32:50 +0100 Subject: [PATCH 35/87] [youtube:tab] fix view_count extraction(closes #27051) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9333e48e4..4089e2aba 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2543,8 +2543,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): renderer, lambda x: x['lengthText']['simpleText'], compat_str)) view_count_text = try_get( renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = int_or_none(self._search_regex( - r'^(\d+)', re.sub(r'\s', '', view_count_text), + view_count = str_to_int(self._search_regex( + r'^([\d,]+)', re.sub(r'\s', '', view_count_text), 'view count', default=None)) uploader = try_get( renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) From 2864179293ba16189544b28356647886960a48fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 18 Nov 2020 03:32:42 +0700 Subject: [PATCH 36/87] [youtube] Improve extraction + Add support for --no-playlist (closes #27009) * Improve playlist and mix extraction (closes #26390, closes #26509, closes #26534, closes 
#27011) + Extract playlist uploader data * Update tests --- test/test_all_urls.py | 9 +- youtube_dl/extractor/youtube.py | 229 +++++++++++++++++++------------- 2 files changed, 142 insertions(+), 96 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 348744028..56a08bed8 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -31,16 +31,17 @@ class TestAllURLsMatching(unittest.TestCase): def test_youtube_playlist_matching(self): assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist']) + assertTab = lambda url: self.assertMatch(url, ['youtube:tab']) assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 assertPlaylist('PL63F0C78739B09958') - # assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') + assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - # assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') - assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 + assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') + assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) # Top tracks - # assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101') + assertTab('https://www.youtube.com/playlist?list=MCUS.20142101') def test_youtube_matching(self): self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4089e2aba..79f87aa85 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -45,6 +45,7 @@ from ..utils import ( unescapeHTML, unified_strdate, unsmuggle_url, + update_url_query, 
uppercase_escape, url_or_none, urlencode_postdata, @@ -65,7 +66,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -974,10 +975,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'sJL6WA-aGkQ', 'only_matching': True, }, - { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, { 'url': 'https://invidio.us/watch?v=BaW_jenozKc', 'only_matching': True, @@ -2351,7 +2348,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): class YoutubeTabIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com tab' - _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|playlist\?.*?\blist=)(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)' IE_NAME = 'youtube:tab' _TESTS = [{ @@ -2361,6 +2358,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', }, }, { # playlists, multipage, different order @@ -2369,14 +2367,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', }, }, { # playlists, singlepage 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', 'playlist_mincount': 4, 'info_dict': { - 'id': 'ThirstForScience', - 'title': 'ThirstForScience', + 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'title': 'ThirstForScience - Playlists', + 'description': 'md5:609399d937ea957b0f53cbffb747a14c', } }, { 'url': 
'https://www.youtube.com/c/ChristophLaimer/playlists', @@ -2407,6 +2407,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Home', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 2, }, { @@ -2415,6 +2416,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 975, }, { @@ -2423,6 +2425,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 199, }, { @@ -2431,6 +2434,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Playlists', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 17, }, { @@ -2439,6 +2443,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Community', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 18, }, { @@ -2447,6 +2452,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Channels', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', }, 'playlist_mincount': 138, }, { @@ -2465,7 +2471,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'uploader': 'Christiaan008', - 'uploader_id': 'ChRiStIaAn008', + 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', }, 'playlist_count': 96, }, { @@ -2475,7 +2481,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', 'uploader': 'Cauchemar', - 'uploader_id': 'Cauchemar89', + 'uploader_id': 
'UCBABnxM4Ar9ten8Mdjj1j0Q', }, 'playlist_mincount': 1123, }, { @@ -2489,7 +2495,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', 'uploader': 'Interstellar Movie', - 'uploader_id': 'InterstellarMovie1', + 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincount': 21, }, { @@ -2498,13 +2504,43 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'title': 'Data Analysis with Dr Mike Pound', 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'Computerphile', + 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', 'uploader': 'Computerphile', }, 'playlist_mincount': 11, }, { 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', 'only_matching': True, + }, { + # Playlist URL that does not actually serve a playlist + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, }] @classmethod @@ -2535,7 +2571,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _extract_video(self, renderer): video_id = renderer.get('videoId') title = try_get( - renderer, 
lambda x: x['title']['runs'][0]['text'], compat_str) + renderer, + (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) description = try_get( renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) @@ -2615,7 +2653,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): for content in video_list_renderer['contents']: if not isinstance(content, dict): continue - renderer = content.get('playlistVideoRenderer') + renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer') if not isinstance(renderer, dict): continue video_id = renderer.get('videoId') @@ -2715,7 +2753,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _entries(self, tab, identity_token): continuation = None - slr_contents = tab['sectionListRenderer']['contents'] + slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or [] for slr_content in slr_contents: if not isinstance(slr_content, dict): continue @@ -2824,13 +2862,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): else: raise ExtractorError('Unable to find selected tab') - def _real_extract(self, url): - item_id = self._match_id(url) - url = compat_urlparse.urlunparse( - compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) - webpage = self._download_webpage(url, item_id) - data = self._extract_yt_initial_data(item_id, webpage) - tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs'] + @staticmethod + def _extract_uploader(data): + uploader = {} + sidebar_renderer = try_get( + data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) + if sidebar_renderer: + for item in sidebar_renderer: + if not isinstance(item, dict): + continue + renderer = item.get('playlistSidebarSecondaryInfoRenderer') + if not isinstance(renderer, dict): + continue + owner = try_get( + renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) + if owner: + uploader['uploader'] = 
owner.get('text') + uploader['uploader_id'] = try_get( + owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) + uploader['uploader_url'] = urljoin( + 'https://www.youtube.com/', + try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) + return uploader + + def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) @@ -2848,42 +2903,69 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title = renderer.get('title') description = None playlist_id = item_id - identity_token = self._search_regex( - r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, - 'identity token', default=None) - return self.playlist_result( + playlist = self.playlist_result( self._entries(selected_tab['content'], identity_token), playlist_id=playlist_id, playlist_title=title, playlist_description=description) + playlist.update(self._extract_uploader(data)) + return playlist + + def _extract_from_playlist(self, item_id, data, playlist): + title = playlist.get('title') or try_get( + data, lambda x: x['titleText']['simpleText'], compat_str) + playlist_id = playlist.get('playlistId') or item_id + return self.playlist_result( + self._playlist_entries(playlist), playlist_id=playlist_id, + playlist_title=title) + + def _real_extract(self, url): + item_id = self._match_id(url) + url = compat_urlparse.urlunparse( + compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + # Handle both video/playlist URLs + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('v', [None])[0] + playlist_id = qs.get('list', [None])[0] + if video_id and playlist_id: + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(video_id, ie=YoutubeIE.ie_key(), 
video_id=video_id) + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + webpage = self._download_webpage(url, item_id) + identity_token = self._search_regex( + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + 'identity token', default=None) + data = self._extract_yt_initial_data(item_id, webpage) + tabs = try_get( + data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + if tabs: + return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + playlist = try_get( + data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) + if playlist: + return self._extract_from_playlist(item_id, data, playlist) + # Fallback to video extraction if no playlist alike page is recognized + if video_id: + return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) + # Failed to recognize + raise ExtractorError('Unable to recognize tab page') class YoutubePlaylistIE(InfoExtractor): IE_DESC = 'YouTube.com playlists' - _VALID_URL = r"""(?x)(?: + _VALID_URL = r'''(?x)(?: (?:https?://)? (?:\w+\.)? (?: (?: youtube(?:kids)?\.com| - invidio\.us + invidio\.us| + youtu\.be ) - / - (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ - )| - youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= - ) - ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,} - # Top tracks, they can also include dots - |(?:MC)[\w\.]* - ) - .* - | - (%(playlist_id)s) - )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + /.*?\?.*?\blist= + )? 
+ (?P<id>%(playlist_id)s) + )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} IE_NAME = 'youtube:playlist' _TESTS = [{ 'note': 'issue #673', @@ -2892,7 +2974,7 @@ class YoutubePlaylistIE(InfoExtractor): 'title': '[OLD]Team Fortress 2 (Class-based LP)', 'id': 'PLBB231211A4F62143', 'uploader': 'Wickydoo', - 'uploader_id': 'Wickydoo', + 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', }, 'playlist_mincount': 29, }, { @@ -2920,41 +3002,8 @@ class YoutubePlaylistIE(InfoExtractor): 'title': '2018 Chinese New Singles (11/6 updated)', 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'uploader': 'LBK', - 'uploader_id': 'sdragonfang', + 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', } - }, { - 'note': 'Embedded SWF player', - 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA7', - 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - }, - 'skip': 'This playlist does not exist', - }, { - # Playlist URL that does not actually serve a playlist - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - 'add_ie': [YoutubeIE.ie_key()], }, { 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', 'info_dict': { @@ -2985,9 +3034,6 @@ class YoutubePlaylistIE(InfoExtractor): # music album playlist 'url': 
'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', - 'only_matching': True, }] @classmethod @@ -2996,13 +3042,12 @@ class YoutubePlaylistIE(InfoExtractor): YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) + playlist_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if not qs: + qs = {'list': playlist_id} return self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, + update_url_query('https://www.youtube.com/playlist', qs), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) @@ -3250,13 +3295,13 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater' _TESTS = [{ - 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', + 'url': 'https://www.youtube.com/feed/watch_later', 'only_matching': True, }, { - 'url': 'https://www.youtube.com/feed/watch_later', + 'url': ':ytwatchlater', 'only_matching': True, }] From 2d7a29081c7419e4148cd0829b4f68a608c78496 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 18 Nov 2020 04:09:02 +0700 Subject: [PATCH 37/87] [spiegel] Fix extraction (closes #24206, closes #24767) Code picked from PR #24767 since original repo is not available due to takedown. 
--- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/spiegel.py | 161 +++++------------------------ youtube_dl/extractor/spiegeltv.py | 17 --- 3 files changed, 29 insertions(+), 152 deletions(-) delete mode 100644 youtube_dl/extractor/spiegeltv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b2baf8057..11ef47261 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1054,8 +1054,7 @@ from .spankbang import ( SpankBangPlaylistIE, ) from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE +from .spiegel import SpiegelIE from .spike import ( BellatorIE, ParamountNetworkIE, diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 4df7f4ddc..2da32b9b2 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -1,159 +1,54 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .spiegeltv import SpiegeltvIE -from ..compat import compat_urlparse -from ..utils import ( - parse_duration, - strip_or_none, - unified_timestamp, -) +from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': 'b57399839d055fccfeb9a0455c439868', + 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', 'info_dict': { - 'id': '563747', + 'id': 'II0BUyxY', + 'display_id': '1259285', 'ext': 'mp4', - 'title': 'Vulkanausbruch in 
Ecuador: Der "Feuerschlund" ist wieder aktiv', + 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', - 'duration': 49, + 'duration': 48.0, 'upload_date': '20130311', - 'timestamp': 1362994320, + 'timestamp': 1362997920, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', - 'info_dict': { - 'id': '580988', - 'ext': 'mp4', - 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', - 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', - 'duration': 983, - 'upload_date': '20131115', - 'timestamp': 1384546642, - }, - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': '97b91083a672d72976faa8433430afb9', - 'info_dict': { - 'id': '601883', - 'ext': 'mp4', - 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. 
Hier kommen seine Antworten auf die besten sechs Fragen.', - 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', - 'upload_date': '20140904', - 'timestamp': 1409834160, - } - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, }, { - # nexx video + 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', + 'only_matching': True, + }, { + 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7', + 'only_matching': True, + }, { 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id - handle = self._request_webpage(metadata_url, video_id) - - # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html - if SpiegeltvIE.suitable(handle.geturl()): - return self.url_result(handle.geturl(), 'Spiegeltv') - - video_data = self._parse_json(self._webpage_read_content( - handle, metadata_url, video_id), video_id) - title = video_data['title'] - nexx_id = video_data['nexxOmniaId'] - domain_id = video_data.get('nexxOmniaDomain') or '748' - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'nexx:%s:%s' % (domain_id, nexx_id), - 'title': title, - 'description': strip_or_none(video_data.get('teaser')), - 'duration': parse_duration(video_data.get('duration')), - 'timestamp': unified_timestamp(video_data.get('datum')), - 'ie_key': NexxIE.ie_key(), - } - - -class SpiegelArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' - IE_NAME = 
'Spiegel:Article' - IE_DESC = 'Articles on spiegel.de' - _TESTS = [{ + }, { 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', - 'info_dict': { - 'id': '1516455', - 'ext': 'mp4', - 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', - 'description': 're:^Patrick Kämnitz gehört.{100,}', - 'upload_date': '20140825', - }, - }, { - 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', - 'info_dict': { - - }, - 'playlist_count': 6, - }, { - # Nexx iFrame embed - 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', - 'info_dict': { - 'id': '161464', - 'ext': 'mp4', - 'title': 'Nervenkitzel Achterbahn', - 'alt_title': 'Karussellbauer in Deutschland', - 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2761, - 'timestamp': 1394021479, - 'upload_date': '20140305', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - # Single video on top of the page - video_link = self._search_regex( - r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, - 'video page URL', default=None) - if video_link: - video_url = compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', video_link) - return self.url_result(video_url) - - # Multiple embedded videos - embeds = re.findall( - r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', - webpage) - entries = [ - self.url_result(compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds] - if embeds: - return self.playlist_result(entries) - - return self.playlist_from_matches( - 
NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) + media_id = self._html_search_regex( + r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2', + webpage, 'media id', group='id') + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': video_id, + 'url': 'jwplatform:%s' % media_id, + 'title': self._og_search_title(webpage, default=None), + 'ie_key': JWPlatformIE.ie_key(), + } diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py deleted file mode 100644 index 6ccf4c342..000000000 --- a/youtube_dl/extractor/spiegeltv.py +++ /dev/null @@ -1,17 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .nexx import NexxIE - - -class SpiegeltvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', - 'only_matching': True, - } - - def _real_extract(self, url): - return self.url_result( - 'https://api.nexx.cloud/v3/748/videos/byid/%s' - % self._match_id(url), ie=NexxIE.ie_key()) From bb2b89e077bf3d105b89d48dc523318a87763e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 18 Nov 2020 04:11:58 +0700 Subject: [PATCH 38/87] [ChangeLog] Actualize [ci skip] --- ChangeLog | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ChangeLog b/ChangeLog index 254d0ef1b..6864bbef3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +version <unreleased> + +Extractors +* [spiegel] Fix extraction (#24206, #24767) +* [youtube] Improve extraction + + Add support for --no-playlist (#27009) + * Improve playlist and mix extraction (#26390, #26509, #26534, #27011) + + Extract playlist uploader data +* [youtube:tab] Fix view count extraction (#27051) +* [malltv] Fix extraction (#27035) ++ [bandcamp] Extract playlist description (#22684) +* [urplay] Fix extraction (#26828) +* [youtube:tab] Fix playlist title extraction (#27015) +* 
[youtube] Fix chapters extraction (#26005) + + version 2020.11.17 Core From 9360936f26cba46c0ccfb668fc463cda3226e151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 18 Nov 2020 04:15:30 +0700 Subject: [PATCH 39/87] release 2020.11.18 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 -- youtube_dl/version.py | 2 +- 8 files changed, 14 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 80baffa2a..64a2eb736 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.17** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.17 + [debug] youtube-dl version 2020.11.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index ee4215296..c0c789673 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.17** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 4c3834fa5..c90dbf30c 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.17** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 3ad3e7409..4c9d295de 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.17** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.17 + [debug] youtube-dl version 2020.11.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index aabbfe83c..a6040daee 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.17. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.17** +- [ ] I've verified that I'm running youtube-dl version **2020.11.18** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 6864bbef3..4d404a56e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.11.18 Extractors * [spiegel] Fix extraction (#24206, #24767) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 86b5ad726..9f0cd6ff6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -824,8 +824,6 @@ - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - - **Spiegel:Article**: Articles on spiegel.de - - **Spiegeltv** - **sport.francetvinfo.fr** - **Sport5** - **SportBox** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ed18392a0..d4c6b936f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.17' +__version__ = '2020.11.18' From 8a6c5b080600465e08f5504f147c9042211c123b Mon Sep 17 00:00:00 2001 From: gdzx <6490707+gdzx@users.noreply.github.com> Date: Tue, 17 Nov 2020 23:06:19 +0100 Subject: [PATCH 40/87] [francetv] Add fallback video url extraction (#27047) Fallback on another API endpoint when no video formats are found. 
Closes ytdl-org#22561 --- youtube_dl/extractor/francetv.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 81b468c7d..d776cf1dd 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -128,17 +128,37 @@ class FranceTVIE(InfoExtractor): is_live = None - formats = [] + videos = [] + for video in info['videos']: if video['statut'] != 'ONLINE': continue + if not video['url']: + continue + videos.append(video) + + if not videos: + for device_type in ['desktop', 'mobile']: + fallback_info = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading fallback %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if fallback_info and fallback_info.get('video'): + videos.append(fallback_info['video']) + + formats = [] + for video in videos: video_url = video['url'] if not video_url: continue if is_live is None: - is_live = (try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], - bool) is True) or '/live.francetv.fr/' in video_url + is_live = ((try_get( + video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True) + or video.get('is_live') is True + or '/live.francetv.fr/' in video_url) format_id = video['format'] ext = determine_ext(video_url) if ext == 'f4m': @@ -154,6 +174,9 @@ class FranceTVIE(InfoExtractor): sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -166,6 +189,7 @@ class FranceTVIE(InfoExtractor): 'url': video_url, 'format_id': format_id, }) + self._sort_formats(formats) title = info['titre'] From 
d65628ef037c5c927cf2990d91f30cc222724171 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 17 Nov 2020 23:16:04 +0100 Subject: [PATCH 41/87] [francetv] improve info extraction --- youtube_dl/extractor/francetv.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index d776cf1dd..92f0851e3 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -17,6 +17,7 @@ from ..utils import ( parse_duration, try_get, url_or_none, + urljoin, ) from .dailymotion import DailymotionIE @@ -130,10 +131,10 @@ class FranceTVIE(InfoExtractor): videos = [] - for video in info['videos']: - if video['statut'] != 'ONLINE': + for video in (info.get('videos') or []): + if video.get('statut') != 'ONLINE': continue - if not video['url']: + if not video.get('url'): continue videos.append(video) @@ -151,15 +152,15 @@ class FranceTVIE(InfoExtractor): formats = [] for video in videos: - video_url = video['url'] + video_url = video.get('url') if not video_url: continue if is_live is None: - is_live = ((try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True) + is_live = (try_get( + video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True or video.get('is_live') is True or '/live.francetv.fr/' in video_url) - format_id = video['format'] + format_id = video.get('format') ext = determine_ext(video_url) if ext == 'f4m': if georestricted: @@ -209,10 +210,10 @@ class FranceTVIE(InfoExtractor): return { 'id': video_id, 'title': self._live_title(title) if is_live else title, - 'description': clean_html(info['synopsis']), - 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), - 'timestamp': int_or_none(info['diffusion']['timestamp']), + 'description': clean_html(info.get('synopsis')), + 
'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')), + 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), + 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, From 5c3f7014efe75ae4c4e1c8aa18a062b8442c917a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Nov 2020 00:41:07 +0100 Subject: [PATCH 42/87] [lbry] add support for odysee.com domain(closes #26806) --- youtube_dl/extractor/lbry.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 587deac90..0a7ee919c 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -16,7 +16,7 @@ from ..utils import ( class LBRYIE(InfoExtractor): IE_NAME = 'lbry.tv' - _VALID_URL = r'https?://(?:www\.)?lbry\.tv/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])' + _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])' _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', @@ -41,6 +41,9 @@ class LBRYIE(InfoExtractor): 'timestamp': 1591312601, 'upload_date': '20200604', } + }, { + 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', + 'only_matching': True, }] def _call_api_proxy(self, method, display_id, params): From c7178f0f7af1d12cdaca5d1328adba5300c7436a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 18 Nov 2020 23:31:35 +0700 Subject: [PATCH 43/87] [extractor/common] Output error for invalid URLs in _is_valid_url (refs #21400, refs #24151, refs #25617, refs #25618, refs #25586, refs #26068, refs #27072) --- youtube_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 
021945a89..0c9089674 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1456,9 +1456,10 @@ class InfoExtractor(object): try: self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True - except ExtractorError: + except ExtractorError as e: self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) + '%s: %s URL is invalid, skipping: %s' + % (video_id, item, error_to_compat_str(e.cause))) return False def http_scheme(self): From 444a68e0ec928a6dd01170362e092e41d69e3781 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Nov 2020 21:06:33 +0100 Subject: [PATCH 44/87] [mgtv] fix format extraction(closes #26415) --- youtube_dl/extractor/mgtv.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index 71fc3ec56..cab3aa045 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -17,9 +17,8 @@ from ..utils import ( class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' - _GEO_COUNTRIES = ['CN'] _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor): }, { 'url': 'http://www.mgtv.com/b/301817/3826653.html', 'only_matching': True, + }, { + 'url': 'https://w.mgtv.com/b/301817/3826653.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ - 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], + 'tk2': 
tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: @@ -56,6 +59,7 @@ class MGTVIE(InfoExtractor): stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ 'pm2': api_data['atc']['pm2'], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] stream_domain = stream_data['stream_domain'][0] From 9b505185da1f7148e02174fce693583af68976d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Nov 2020 03:26:49 +0700 Subject: [PATCH 45/87] [arte] Extract m3u8 formats (closes #27061) --- youtube_dl/extractor/arte.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 2bd3bfe8a..b80467548 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -11,6 +11,7 @@ from ..utils import ( qualities, try_get, unified_strdate, + url_or_none, ) # There are different sources of video in arte.tv, the extraction process @@ -63,8 +64,13 @@ class ArteTVBaseIE(InfoExtractor): langcode = LANGS.get(lang, lang) formats = [] + m3u8_formats = [] for format_id, format_dict in vsr.items(): f = dict(format_dict) + format_url = url_or_none(f.get('url')) + streamer = f.get('streamer') + if not format_url and not streamer: + continue versionCode = f.get('versionCode') l = re.escape(langcode) @@ -107,6 +113,15 @@ class ArteTVBaseIE(InfoExtractor): else: lang_pref = -1 + media_type = f.get('mediaType') + if media_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for m3u8_format in m3u8_formats: + m3u8_format['language_preference'] = lang_pref + continue + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -118,7 +133,7 @@ class ArteTVBaseIE(InfoExtractor): 
'quality': qfunc(f.get('quality')), } - if f.get('mediaType') == 'rtmp': + if media_type == 'rtmp': format['url'] = f['streamer'] format['play_path'] = 'mp4:' + f['url'] format['ext'] = 'flv' @@ -128,6 +143,8 @@ class ArteTVBaseIE(InfoExtractor): formats.append(format) self._check_formats(formats, video_id) + + formats.extend(m3u8_formats) self._sort_formats(formats) info_dict['formats'] = formats From 91e954587f5b2b4760b73275ad0720933c0136a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Nov 2020 05:02:04 +0700 Subject: [PATCH 46/87] [arte] Rework extractors * Reimplement embed and playlist extractors to delegate to the single entrypoint artetv extractor Beware reluctant download archive extractor keys breakage. * Improve embeds detection (closes #27057) - Remove obsolete code --- youtube_dl/extractor/arte.py | 154 ++++++++++++++++++----------- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/generic.py | 9 +- 3 files changed, 100 insertions(+), 65 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b80467548..03abdbfaf 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -14,14 +17,44 @@ from ..utils import ( url_or_none, ) -# There are different sources of video in arte.tv, the extraction process -# is different for each one. The videos usually expire in 7 days, so we can't -# add tests. 
- class ArteTVBaseIE(InfoExtractor): - def _extract_from_json_url(self, json_url, video_id, lang, title=None): - info = self._download_json(json_url, video_id) + _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' + _API_BASE = 'https://api.arte.tv/api/player/v1' + + +class ArteTVIE(ArteTVBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos| + api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s) + ) + /(?P<id>\d{6}-\d{3}-[AF]) + ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'info_dict': { + 'id': '088501-000-A', + 'ext': 'mp4', + 'title': 'Mexico: Stealing Petrol to Survive', + 'upload_date': '20190628', + }, + }, { + 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', + 'only_matching': True, + }, { + 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + lang = mobj.group('lang') or mobj.group('lang_2') + + info = self._download_json( + '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) player_info = info['videoJsonPlayer'] vsr = try_get(player_info, lambda x: x['VSR'], dict) @@ -38,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor): if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - title = (player_info.get('VTI') or title or player_info['VID']).strip() + title = (player_info.get('VTI') or player_info['VID']).strip() subtitle = player_info.get('VSU', '').strip() if subtitle: title += ' - %s' % subtitle - info_dict = { - 'id': player_info['VID'], - 'title': title, - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), - } qfunc = qualities(['MQ', 
'HQ', 'EQ', 'SQ']) LANGS = { @@ -64,7 +90,6 @@ class ArteTVBaseIE(InfoExtractor): langcode = LANGS.get(lang, lang) formats = [] - m3u8_formats = [] for format_id, format_dict in vsr.items(): f = dict(format_dict) format_url = url_or_none(f.get('url')) @@ -120,6 +145,7 @@ class ArteTVBaseIE(InfoExtractor): m3u8_id=format_id, fatal=False) for m3u8_format in m3u8_formats: m3u8_format['language_preference'] = lang_pref + formats.extend(m3u8_formats) continue format = { @@ -142,58 +168,50 @@ class ArteTVBaseIE(InfoExtractor): formats.append(format) - self._check_formats(formats, video_id) - - formats.extend(m3u8_formats) self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict + return { + 'id': player_info.get('VID') or video_id, + 'title': title, + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(upload_date_str), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'formats': formats, + } -class ArteTVPlus7IE(ArteTVBaseIE): - IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])' - +class ArteTVEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' _TESTS = [{ - 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { - 'id': '088501-000-A', + 'id': '100605-013-A', 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', + 'title': 'United we Stream November Lockdown Edition #13', + 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', + 'upload_date': '20201116', }, + }, { + 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 
'only_matching': True, }] - def _real_extract(self, url): - lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url( - 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), - video_id, lang) - - -class ArteTVEmbedIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:embed' - _VALID_URL = r'''(?x) - https://www\.arte\.tv - /player/v3/index\.php\?json_url= - (?P<json_url> - https?://api\.arte\.tv/api/player/v1/config/ - (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF]) - ) - ''' - - _TESTS = [] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', + webpage)] def _real_extract(self, url): - json_url, lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url(json_url, video_id, lang) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + json_url = qs['json_url'][0] + video_id = ArteTVIE._match_id(json_url) + return self.url_result( + json_url, ie=ArteTVIE.ie_key(), video_id=video_id) class ArteTVPlaylistIE(ArteTVBaseIE): - IE_NAME = 'arte.tv:playlist' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})' - + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', 'info_dict': { @@ -202,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', }, 'playlist_mincount': 6, + }, { + 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', + 'only_matching': True, }] def _real_extract(self, url): lang, playlist_id = re.match(self._VALID_URL, url).groups() collection = self._download_json( - 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' - % (lang, playlist_id), playlist_id) + 
'%s/collectionData/%s/%s?source=videos' + % (self._API_BASE, lang, playlist_id), playlist_id) + entries = [] + for video in collection['videos']: + if not isinstance(video, dict): + continue + video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) + if not video_url: + continue + video_id = video.get('programId') + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), + 'duration': int_or_none(video.get('durationSeconds')), + 'view_count': int_or_none(video.get('views')), + 'ie_key': ArteTVIE.ie_key(), + }) title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') - entries = [ - self._extract_from_json_url( - video['jsonUrl'], video.get('programId') or playlist_id, lang) - for video in collection['videos'] if video.get('jsonUrl')] return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 11ef47261..088800eb9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -58,7 +58,7 @@ from .ard import ( ARDMediathekIE, ) from .arte import ( - ArteTVPlus7IE, + ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, ) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 355067a50..d08a8cca5 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -91,6 +91,7 @@ from .piksel import PikselIE from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE +from .arte import ArteTVEmbedIE from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -2760,11 +2761,9 @@ class GenericIE(InfoExtractor): return self.url_result(ustream_url, 
UstreamIE.ie_key()) # Look for embedded arte.tv player - mobj = re.search( - r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'ArteTVEmbed') + arte_urls = ArteTVEmbedIE._extract_urls(webpage) + if arte_urls: + return self.playlist_from_matches(arte_urls, video_id, video_title) # Look for embedded francetv player mobj = re.search( From b1347a5881fa09777408a77360f0b5ce7ae6450c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Nov 2020 05:16:25 +0700 Subject: [PATCH 47/87] [youporn] Fix upload date extraction and make comment count optional (closes #26986) --- youtube_dl/extractor/youporn.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index e7fca22de..7b9feafeb 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -29,7 +29,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20101217', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -48,7 +47,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20110418', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -156,7 +154,8 @@ class YouPornIE(InfoExtractor): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - [r'Date\s+[Aa]dded:\s*<span>([^<]+)', + [r'UPLOADED:\s*<span>([^<]+)', + r'Date\s+[Aa]dded:\s*<span>([^<]+)', r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], webpage, 'upload date', fatal=False)) @@ -171,7 +170,7 @@ class YouPornIE(InfoExtractor): webpage, 'view count', fatal=False, group='count')) comment_count = str_to_int(self._search_regex( r'>All 
[Cc]omments? \(([\d,.]+)\)', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', default=None)) def extract_tag_box(regex, title): tag_box = self._search_regex(regex, webpage, title, default=None) From 32152bab7a488f8b563444e8d8cddd6d09f8a54a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Nov 2020 05:21:09 +0700 Subject: [PATCH 48/87] [ChangeLog] Actualize [ci skip] --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4d404a56e..a93fcbf7f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +version <unreleased> + +Core +* [extractor/common] Output error for invalid URLs in _is_valid_url (#21400, + #24151, #25617, #25618, #25586, #26068, #27072) + +Extractors +* [youporn] Fix upload date extraction +* [youporn] Make comment count optional (#26986) +* [arte] Rework extractors + * Reimplement embed and playlist extractors to delegate to the single + entrypoint artetv extractor + * Improve embeds detection (#27057) ++ [arte] Extract m3u8 formats (#27061) +* [mgtv] Fix format extraction (#26415) ++ [lbry] Add support for odysee.com (#26806) +* [francetv] Improve info extraction ++ [francetv] Add fallback video URL extraction (#27047) + + version 2020.11.18 Extractors From 039e715b3020710546b13976042bd18ab6e2df5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Nov 2020 05:22:27 +0700 Subject: [PATCH 49/87] release 2020.11.19 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 +++--- youtube_dl/version.py | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md 
b/.github/ISSUE_TEMPLATE/1_broken_site.md index 64a2eb736..e2e5a15ec 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.18** +- [ ] I've verified that I'm running youtube-dl version **2020.11.19** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.18 + [debug] youtube-dl version 2020.11.19 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index c0c789673..880a96835 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.18** +- [ ] I've verified that I'm running youtube-dl version **2020.11.19** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index c90dbf30c..25c5a7daf 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.18** +- [ ] I've verified that I'm running youtube-dl version **2020.11.19** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 4c9d295de..59716f962 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.18** +- [ ] I've verified that I'm running youtube-dl version **2020.11.19** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.18 + [debug] youtube-dl version 2020.11.19 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index a6040daee..410abee90 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.18. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.18** +- [ ] I've verified that I'm running youtube-dl version **2020.11.19** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index a93fcbf7f..572e0360d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.11.19 Core * [extractor/common] Output error for invalid URLs in _is_valid_url (#21400, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9f0cd6ff6..37425e7a1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -58,9 +58,9 @@ - **ARD:mediathek** - **ARDBetaMediathek** - **Arkena** - - **arte.tv:+7** - - **arte.tv:embed** - - **arte.tv:playlist** + - **ArteTV** + - **ArteTVEmbed** + - **ArteTVPlaylist** - **AsianCrush** - **AsianCrushPlaylist** - **AtresPlayer** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d4c6b936f..53eeb6ccf 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.18' +__version__ = '2020.11.19' From 4fe190df705d16c66fc3e7b2d798ff14ebbc9878 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 19 Nov 2020 11:54:54 +0100 Subject: [PATCH 50/87] [mtv] fix mgid extraction(closes #26841) --- youtube_dl/extractor/mtv.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index fedd5f46b..df1034fc5 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -349,6 +349,18 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] + @staticmethod + def extract_child_with_type(parent, t): + children = parent['children'] 
+ return next(c for c in children if c.get('type') == t) + + def _extract_mgid(self, webpage): + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self.extract_child_with_type(data, 'MainContainer') + video_player = self.extract_child_with_type(main_container, 'VideoPlayer') + return video_player['props']['media']['video']['config']['uri'] + class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' From 2dbb45ae82836699486d434cfb6d902920bab66e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 19 Nov 2020 13:12:58 +0100 Subject: [PATCH 51/87] [vimeo:album] fix extraction(closes #27079) --- youtube_dl/extractor/vimeo.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 421795b94..cfd04d50c 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -946,10 +946,13 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): def _real_extract(self, url): album_id = self._match_id(url) - webpage = self._download_webpage(url, album_id) - viewer = self._parse_json(self._search_regex( - r'bootstrap_data\s*=\s*({.+?})</script>', - webpage, 'bootstrap data'), album_id)['viewer'] + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + if not viewer: + webpage = self._download_webpage(url, album_id) + viewer = self._parse_json(self._search_regex( + r'bootstrap_data\s*=\s*({.+?})</script>', + webpage, 'bootstrap data'), album_id)['viewer'] jwt = viewer['jwt'] album = self._download_json( 'https://api.vimeo.com/albums/' + album_id, From cf1a8668e8e47a56c834fb567d227787d7480d08 Mon Sep 17 00:00:00 2001 From: Joost Verdoorn <jpverdoorn@gmail.com> Date: Thu, 19 Nov 2020 17:26:53 +0100 Subject: [PATCH 52/87] [Amara] Add new extractor (#20618) * [Amara] Add new extractor --- youtube_dl/extractor/amara.py | 76 ++++++++++++++++++++++++++++++ 
youtube_dl/extractor/extractors.py | 1 + 2 files changed, 77 insertions(+) create mode 100644 youtube_dl/extractor/amara.py diff --git a/youtube_dl/extractor/amara.py b/youtube_dl/extractor/amara.py new file mode 100644 index 000000000..b222154bd --- /dev/null +++ b/youtube_dl/extractor/amara.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor + + +class AmaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)' + _TESTS = [ + { + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour' + } + }, + { + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294649110, + 'upload_date': '20110110', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, + { + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 'ChimamandaAdichie_2009G-transcript', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20131206' + } + } + ] + + def get_subtitles_for_language(self, language): + return [{ + 'ext': type, + 'url': 
language['subtitles_uri'].replace('format=json', 'format=' + type) + } for type in ['vtt', 'srt', 'json']] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json('https://amara.org/api/videos/%s/' % video_id, video_id, query={'format': 'json'}) + + video_url = meta.get('all_urls')[0] + subtitles = dict([(language['code'], self.get_subtitles_for_language(language)) for language in meta.get('languages', []) if language['published']]) + + return { + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'subtitles': subtitles, + 'title': meta['title'], + 'description': meta.get('description'), + 'thumbnail': meta.get('thumbnail') + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 088800eb9..183050e07 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -36,6 +36,7 @@ from .afreecatv import AfreecaTVIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amara import AmaraIE from .amcnetworks import AMCNetworksIE from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE From 2cf8003638ef76a0f76541229ecab1adf739a3ae Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 19 Nov 2020 17:29:30 +0100 Subject: [PATCH 53/87] [amara] improve extraction --- youtube_dl/extractor/amara.py | 143 ++++++++++++++++++++-------------- 1 file changed, 85 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/amara.py b/youtube_dl/extractor/amara.py index b222154bd..61d469574 100644 --- a/youtube_dl/extractor/amara.py +++ b/youtube_dl/extractor/amara.py @@ -1,76 +1,103 @@ # coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor +from .youtube import YoutubeIE +from .vimeo import VimeoIE +from ..utils import ( + int_or_none, + parse_iso8601, + update_url_query, +) class 
AmaraIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)' - _TESTS = [ - { - 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', - 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', - 'info_dict': { - 'id': 'h6ZuVdvYnfE', - 'ext': 'mp4', - 'title': 'Why jury trials are becoming less common', - 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': dict, - 'upload_date': '20160813', - 'uploader': 'PBS NewsHour', - 'uploader_id': 'PBSNewsHour' - } - }, - { - 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', - 'md5': '99392c75fa05d432a8f11df03612195e', - 'info_dict': { - 'id': '18622084', - 'ext': 'mov', - 'title': 'Vimeo at CES 2011!', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': dict, - 'timestamp': 1294649110, - 'upload_date': '20110110', - 'uploader': 'Sam Morrill', - 'uploader_id': 'sammorrill' - } - }, - { - 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', - 'md5': 'd3970f08512738ee60c5807311ff5d3f', - 'info_dict': { - 'id': 'ChimamandaAdichie_2009G-transcript', - 'ext': 'mp4', - 'title': 'The danger of a single story', - 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': dict, - 'upload_date': '20131206' - } + _TESTS = [{ + # Youtube + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour', + 'timestamp': 1549639570, } - ] - 
- def get_subtitles_for_language(self, language): - return [{ - 'ext': type, - 'url': language['subtitles_uri'].replace('format=json', 'format=' + type) - } for type in ['vtt', 'srt', 'json']] + }, { + # Vimeo + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294763658, + 'upload_date': '20110111', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, { + # Direct Link + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 's8KL7I3jLmh6', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20091007', + 'timestamp': 1254942511, + } + }] def _real_extract(self, url): video_id = self._match_id(url) - meta = self._download_json('https://amara.org/api/videos/%s/' % video_id, video_id, query={'format': 'json'}) + meta = self._download_json( + 'https://amara.org/api/videos/%s/' % video_id, + video_id, query={'format': 'json'}) + title = meta['title'] + video_url = meta['all_urls'][0] - video_url = meta.get('all_urls')[0] - subtitles = dict([(language['code'], self.get_subtitles_for_language(language)) for language in meta.get('languages', []) if language['published']]) + subtitles = {} + for language in (meta.get('languages') or []): + subtitles_uri = language.get('subtitles_uri') + if not (subtitles_uri and language.get('published')): + continue + subtitle = subtitles.setdefault(language.get('code') or 'en', []) + for f in ('json', 'srt', 'vtt'): + subtitle.append({ + 'ext': f, + 'url': update_url_query(subtitles_uri, {'format': 
f}), + }) - return { - '_type': 'url_transparent', + info = { 'url': video_url, 'id': video_id, 'subtitles': subtitles, - 'title': meta['title'], + 'title': title, 'description': meta.get('description'), - 'thumbnail': meta.get('thumbnail') + 'thumbnail': meta.get('thumbnail'), + 'duration': int_or_none(meta.get('duration')), + 'timestamp': parse_iso8601(meta.get('created')), } + + for ie in (YoutubeIE, VimeoIE): + if ie.suitable(video_url): + info.update({ + '_type': 'url_transparent', + 'ie_key': ie.ie_key(), + }) + break + + return info From 25a35cb38a472731d9f487309f6bcff94ee4918c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 19 Nov 2020 20:01:24 +0100 Subject: [PATCH 54/87] [googledrive] fix format extraction(closes #26979) --- youtube_dl/extractor/googledrive.py | 58 +++++++++++------------------ 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index f2cc57e44..de8c80e36 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_parse_qs from ..utils import ( determine_ext, ExtractorError, int_or_none, lowercase_escape, + try_get, update_url_query, ) @@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor): # video can't be watched anonymously due to view count limit reached, # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', - 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', - 'info_dict': { - 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', - 'ext': 'mp4', - 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', - } + 'only_matching': True, }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', - 'info_dict': { - 'id': 
'1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', - 'ext': 'mp4', - 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', - 'duration': 189, - }, 'only_matching': True, }, { 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', @@ -171,23 +162,21 @@ class GoogleDriveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://docs.google.com/file/d/%s' % video_id, video_id) + video_info = compat_parse_qs(self._download_webpage( + 'https://drive.google.com/get_video_info', + video_id, query={'docid': video_id})) - title = self._search_regex( - r'"title"\s*,\s*"([^"]+)', webpage, 'title', - default=None) or self._og_search_title(webpage) - duration = int_or_none(self._search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', - default=None)) + def get_value(key): + return try_get(video_info, lambda x: x[key][0]) + + reason = get_value('reason') + title = get_value('title') + if not title and reason: + raise ExtractorError(reason, expected=True) formats = [] - fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, - 'fmt stream map', default='').split(',') - fmt_list = self._search_regex( - r'"fmt_list"\s*,\s*"([^"]+)', webpage, - 'fmt_list', default='').split(',') + fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') + fmt_list = (get_value('fmt_list') or '').split(',') if fmt_stream_map and fmt_list: resolutions = {} for fmt in fmt_list: @@ -257,19 +246,14 @@ class GoogleDriveIE(InfoExtractor): if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) - if not formats: - reason = self._search_regex( - r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) - if reason: - raise ExtractorError(reason, expected=True) + if not formats and reason: + raise ExtractorError(reason, expected=True) self._sort_formats(formats) - hl = self._search_regex( - r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', 
default=None) + hl = get_value('hl') subtitles_id = None - ttsurl = self._search_regex( - r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) + ttsurl = get_value('ttsurl') if ttsurl: # the video Id for subtitles will be the last value in the ttsurl # query string @@ -279,8 +263,8 @@ class GoogleDriveIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, + 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id, + 'duration': int_or_none(get_value('length_seconds')), 'formats': formats, 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), 'automatic_captions': self.extract_automatic_captions( From daa25d414284747980a9ad32e138a2ae388fcd0c Mon Sep 17 00:00:00 2001 From: beefchop <32330393+beefchop@users.noreply.github.com> Date: Fri, 20 Nov 2020 07:38:09 +1100 Subject: [PATCH 55/87] [viki] fix stream extraction from mpd (#27092) Co-authored-by: beefchop <beefchop@users.noreply.github.com> --- youtube_dl/extractor/viki.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index b0dcdc0e6..48ab7b944 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -296,6 +296,9 @@ class VikiIE(VikiBaseIE): if f.get('acodec') == 'none' and f.get('vcodec') != 'none': f['acodec'] = None formats.extend(m3u8_formats) + elif format_id == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, fatal=False)) elif format_url.startswith('rtmp'): mobj = re.search( r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', From 59e583f7e8530ca92776c866897d895c072e2a82 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 19 Nov 2020 22:45:46 +0100 Subject: [PATCH 56/87] [viki] improve format extraction --- youtube_dl/extractor/viki.py | 142 ++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 59 
deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 48ab7b944..a003b7af8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import hashlib import hmac import itertools @@ -9,6 +10,10 @@ import re import time from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -165,19 +170,20 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '5fa476a902e902783ac7a4d615cdbc7a', + 'md5': '94e0e34fd58f169f40c184f232356cfe', 'info_dict': { 'id': '44699v', 'ext': 'mp4', 'title': 'Boys Over Flowers - Episode 1', 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4204, + 'duration': 4172, 'timestamp': 1270496524, 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, 'age_limit': 13, - } + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -194,14 +200,15 @@ class VikiIE(VikiBaseIE): 'uploader_id': 'ad14065n', 'like_count': int, 'age_limit': 13, - } + }, + 'skip': 'Page not found!', }, { 'url': 'http://www.viki.com/player/44699v', 'only_matching': True, }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '1713ae35df5a521b31f6dc40730e7c9c', + 'md5': 'adf9e321a0ae5d0aace349efaaff7691', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -217,8 +224,11 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - video = self._call_api( - 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + resp = self._download_json( + 'https://www.viki.com/api/videos/' + video_id, + video_id, 'Downloading video JSON', + 
headers={'x-viki-app-ver': '4.0.57'}) + video = resp['video'] self._check_errors(video) @@ -265,60 +275,74 @@ class VikiIE(VikiBaseIE): 'subtitles': subtitles, } - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result - formats = [] - for format_id, stream_dict in streams.items(): - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - for protocol, format_dict in stream_dict.items(): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - continue - format_url = format_dict['url'] - if format_id == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.extend(m3u8_formats) - elif format_id == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, 'mpd-%s' % protocol, fatal=False)) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', - format_url) - if not mobj: + + def add_format(format_id, format_dict, protocol='http'): + # rtmps URLs does not seem to work + if protocol == 'rtmps': + return + format_url = format_dict.get('url') + if not format_url: + return + qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) + stream = qs.get('stream', [None])[0] + if stream: + format_url = base64.b64decode(stream).decode() + if format_id in ('m3u8', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS 
metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if '_drm/index_' in f['url']: continue - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - }) - else: - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': height, - }) + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + formats.append(f) + elif format_id in ('mpd', 'dash'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, fatal=False)) + elif format_url.startswith('rtmp'): + mobj = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', + format_url) + if not mobj: + return + formats.append({ + 'format_id': 'rtmp-%s' % format_id, + 'ext': 'flv', + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': url, + }) + else: + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (format_id, protocol), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)), + }) + + for format_id, format_dict in (resp.get('streams') or {}).items(): + add_format(format_id, format_dict) + if not formats: + streams = self._call_api( + 'videos/%s/streams.json' % video_id, video_id, + 'Downloading video streams JSON') + + if 'external' in streams: + result.update({ + '_type': 'url_transparent', + 'url': streams['external']['url'], + }) + return result + + for format_id, stream_dict in streams.items(): + for protocol, format_dict in stream_dict.items(): + add_format(format_id, format_dict, protocol) self._sort_formats(formats) result['formats'] = formats From dd9e0f58f3482204491007e06a134c69788b1c82 Mon Sep 17 00:00:00 2001 From: Leonardo Taccari <iamleot@gmail.com> Date: Fri, 20 Nov 2020 10:00:05 +0100 Subject: [PATCH 57/87] 
[rai] Fix extraction for recent raiplay.it updates (#27077) - Remove first test of RaiPlayIE: it is no longer available - Make RaiPlayIE extension-agnostic (passing possible `.json' URLs is now supported too) - Adjust RaiPlayLiveIE to recent raiplay.it updates. Passing it as `url_transparent' is no longer supported (there is no longer an accessible ContentItem) - Adjust RaiPlayPlaylistIE to recent raiplay.it updates and instruct it about ContentSet-s. - Update a RaiIE test and remove two tests that are no longer availables Thanks to @remitamine for the review! --- youtube_dl/extractor/rai.py | 126 +++++++++++++++--------------------- 1 file changed, 52 insertions(+), 74 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index bee2d53f5..dae7800d2 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -17,7 +17,6 @@ from ..utils import ( int_or_none, parse_duration, strip_or_none, - unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -122,27 +121,8 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', - 'md5': '340aa3b7afb54bfd14a8c11786450d76', - 'info_dict': { - 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', - 'ext': 'mp4', - 'title': 'La Casa Bianca', - 'alt_title': 'S2016 - Puntata del 23/10/2016', - 'description': 'md5:a09d45890850458077d1f68bb036e0a5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 3', - 'creator': 'Rai 3', - 'duration': 3278, - 'timestamp': 1477764300, - 'upload_date': '20161029', - 'series': 'La Casa Bianca', - 'season': '2016', - }, - 'skip': 'This content is not available', - }, { 'url': 
'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { @@ -166,10 +146,11 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - url, video_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + base, video_id, = mobj.group('base', 'id') media = self._download_json( - url.replace('.html', '.json'), video_id, 'Downloading video JSON') + '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') title = media['name'] @@ -219,7 +200,7 @@ class RaiPlayIE(RaiBaseIE): class RaiPlayLiveIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' _TEST = { 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { @@ -227,7 +208,7 @@ class RaiPlayLiveIE(RaiBaseIE): 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:6eca31500550f9376819f174e5644754', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, @@ -238,53 +219,75 @@ class RaiPlayLiveIE(RaiBaseIE): } def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + base, display_id, = mobj.group('base', 'id') - webpage = self._download_webpage(url, display_id) + media = self._download_json( + '%s.json' % base, + display_id, 'Downloading channel JSON') - video_id = self._search_regex( - r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, - webpage, 'content id') + title = media['name'] + video = media['video'] + video_id = media['id'].replace('ContentItem-', '') - return { - '_type': 'url_transparent', - 'ie_key': RaiPlayIE.ie_key(), - 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, + relinker_info = 
self._extract_relinker_info(video['content_url'], video_id) + self._sort_formats(relinker_info['formats']) + + info = { 'id': video_id, 'display_id': display_id, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, + 'description': media.get('description'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), } + info.update(relinker_info) + return info + class RaiPlayPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', - 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, 'playlist_mincount': 12, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + base, playlist_id, = mobj.group('base', 'id') - webpage = self._download_webpage(url, playlist_id) + media = self._download_json( + '%s.json' % base, + playlist_id, 'Downloading program JSON') - title = self._html_search_meta( - ('programma', 'nomeProgramma'), webpage, 'title') - description = unescapeHTML(self._html_search_meta( - ('description', 'og:description'), webpage, 'description')) + title = media.get('name') + description = None + if media.get('program_info') and media['program_info'].get('description'): + description = media['program_info']['description'] entries = [] - for mobj in re.finditer( - r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', - webpage): - video_url = urljoin(url, mobj.group('path')) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for b in media.get('blocks', []): + for s in b.get('sets', []): + cs = s.get('id') + if not cs: + continue + 
medias = self._download_json( + '%s/%s.json' % (base, cs), + cs, 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in medias['items']: + video_url = urljoin(url, m['path_id']) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) return self.playlist_result(entries, playlist_id, title, description) @@ -329,19 +332,6 @@ class RaiIE(RaiBaseIE): 'duration': 2214, 'upload_date': '20161103', } - }, { - # drawMediaRaiTV(...) - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20141221', - }, - 'skip': 'This content is not available', }, { # initEdizione('ContentItem-...' 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -353,18 +343,6 @@ class RaiIE(RaiBaseIE): 'upload_date': '20170401', }, 'skip': 'Changes daily', - }, { - # HDS live stream with only relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This content is available only in Italy', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', From af7bb684c086292e855d4e6f37a724ef27e14bd8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 20 Nov 2020 10:01:56 +0100 Subject: [PATCH 58/87] [rai] improve extraction --- youtube_dl/extractor/rai.py | 82 
++++++++++++------------------------- 1 file changed, 27 insertions(+), 55 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index dae7800d2..b072a0f38 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -16,7 +16,9 @@ from ..utils import ( GeoRestrictedError, int_or_none, parse_duration, + remove_start, strip_or_none, + try_get, unified_strdate, unified_timestamp, update_url_query, @@ -121,7 +123,7 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)\.(?:html|json)' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -146,11 +148,10 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base, video_id, = mobj.group('base', 'id') + base, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') + base + '.json', video_id, 'Downloading video JSON') title = media['name'] @@ -177,7 +178,8 @@ class RaiPlayIE(RaiBaseIE): season = media.get('season') info = { - 'id': video_id, + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, 'alt_title': strip_or_none(media.get('subtitle')), @@ -199,9 +201,9 @@ class RaiPlayIE(RaiBaseIE): return info -class RaiPlayLiveIE(RaiBaseIE): +class RaiPlayLiveIE(RaiPlayIE): _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' - _TEST = { + _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', @@ -216,35 +218,7 
@@ class RaiPlayLiveIE(RaiBaseIE): 'params': { 'skip_download': True, }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base, display_id, = mobj.group('base', 'id') - - media = self._download_json( - '%s.json' % base, - display_id, 'Downloading channel JSON') - - title = media['name'] - video = media['video'] - video_id = media['id'].replace('ContentItem-', '') - - relinker_info = self._extract_relinker_info(video['content_url'], video_id) - self._sort_formats(relinker_info['formats']) - - info = { - 'id': video_id, - 'display_id': display_id, - 'title': self._live_title(title) if relinker_info.get( - 'is_live') else title, - 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), - } - - info.update(relinker_info) - return info + }] class RaiPlayPlaylistIE(InfoExtractor): @@ -260,36 +234,34 @@ class RaiPlayPlaylistIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base, playlist_id, = mobj.group('base', 'id') + base, playlist_id = re.match(self._VALID_URL, url).groups() - media = self._download_json( - '%s.json' % base, - playlist_id, 'Downloading program JSON') - - title = media.get('name') - description = None - if media.get('program_info') and media['program_info'].get('description'): - description = media['program_info']['description'] + program = self._download_json( + base + '.json', playlist_id, 'Downloading program JSON') entries = [] - for b in media.get('blocks', []): - for s in b.get('sets', []): - cs = s.get('id') - if not cs: + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + s_id = s.get('id') + if not s_id: continue medias = self._download_json( - '%s/%s.json' % (base, cs), - cs, 'Downloading content set JSON', fatal=False) + '%s/%s.json' % (base, s_id), s_id, + 'Downloading content set JSON', fatal=False) if not medias: continue - for m in medias['items']: - 
video_url = urljoin(url, m['path_id']) + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) entries.append(self.url_result( video_url, ie=RaiPlayIE.ie_key(), video_id=RaiPlayIE._match_id(video_url))) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + entries, playlist_id, program.get('name'), + try_get(program, lambda x: x['program_info']['description'])) class RaiIE(RaiBaseIE): From a78e530c14cf7caf9449918285653d8284ef30dc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 20 Nov 2020 10:10:57 +0100 Subject: [PATCH 59/87] [rai] fix unavailable video format detection --- youtube_dl/extractor/rai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index b072a0f38..06958966f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -68,7 +68,7 @@ class RaiBaseIE(InfoExtractor): # This does not imply geo restriction (e.g. 
# http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if media_url == 'http://download.rai.it/video_no_available.mp4': + if '/video_no_available.mp4' in media_url: continue ext = determine_ext(media_url) From 7bc7fbce239da880f7ab67fc5be55ea82df64e20 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 20 Nov 2020 10:26:55 +0100 Subject: [PATCH 60/87] [rai] fix protocol relative relinker URLs(closes #22766) --- youtube_dl/extractor/rai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 06958966f..ecb628f14 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -424,7 +424,7 @@ class RaiIE(RaiBaseIE): except ExtractorError: pass - relinker_url = self._search_regex( + relinker_url = self._proto_relative_url(self._search_regex( r'''(?x) (?: var\s+videoURL| @@ -436,7 +436,7 @@ class RaiIE(RaiBaseIE): //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? 
(?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 ''', - webpage, 'relinker URL', group='url') + webpage, 'relinker URL', group='url')) relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) From 86f2fa1590991fffae7b1daacae9164771312c0b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 20 Nov 2020 10:47:52 +0100 Subject: [PATCH 61/87] [discoverynetworks] add support new TLC/DMAX URLs(closes #27100) --- youtube_dl/extractor/discoverynetworks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py index 607a54948..c512b95d0 100644 --- a/youtube_dl/extractor/discoverynetworks.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -7,7 +7,7 @@ from .dplay import DPlayIE class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' _TESTS = [{ 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', @@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE): }, { 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, }] def _real_extract(self, url): From b31b5f4434b52816f3a5a1ae2cbe1d162be0fbd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Nov 2020 23:21:52 +0700 Subject: [PATCH 62/87] [youtube] Improve yt initial data extraction (closes #27093) --- youtube_dl/extractor/youtube.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py 
b/youtube_dl/extractor/youtube.py index 79f87aa85..a85aede8e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -283,6 +283,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): }, } + _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' + def _call_api(self, ep, query, video_id): data = self._DEFAULT_API_DATA.copy() data.update(query) @@ -299,8 +301,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _extract_yt_initial_data(self, video_id, webpage): return self._parse_json( self._search_regex( - r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;', - webpage, 'yt initial data'), + (r'%s\s*\n' % self._YT_INITIAL_DATA_RE, + self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), video_id) @@ -1066,6 +1068,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093) + 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', + 'info_dict': { + 'id': 'CHqg6qOn4no', + 'ext': 'mp4', + 'title': 'Part 77 Sort a list of simple types in c#', + 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', + 'upload_date': '20130831', + 'uploader_id': 'kudvenkat', + 'uploader': 'kudvenkat', + }, + 'params': { + 'skip_download': True, + }, + }, ] def __init__(self, *args, **kwargs): From ec99f4710877731da4619617a89cf1dd45a2fc2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 20 Nov 2020 23:34:46 +0700 Subject: [PATCH 63/87] [youtube:tab] Replace some test URLs with RIAA-friendly ones --- youtube_dl/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a85aede8e..fd9e54c1f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2474,13 +2474,13 @@ class 
YoutubeTabIE(YoutubeBaseInfoExtractor): }, 'playlist_mincount': 138, }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { - 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', + 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { - 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ', + 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', @@ -2527,7 +2527,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, 'playlist_mincount': 11, }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'only_matching': True, }, { # Playlist URL that does not actually serve a playlist From ab0eda99e1d1c6cd6aa697f4931c439bec350bd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 04:00:08 +0700 Subject: [PATCH 64/87] [YoutubeDL] Fix --ignore-errors for playlists with generator-based entries of url_transparent (closes #27064) --- test/test_YoutubeDL.py | 70 +++++++++++++++++++++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 52 +++++++++++++++++------------- 2 files changed, 101 insertions(+), 21 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1e204e551..62f916d11 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -919,6 +919,76 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(downloaded['extractor'], 'testex') self.assertEqual(downloaded['extractor_key'], 'TestEx') + # Test case for https://github.com/ytdl-org/youtube-dl/issues/27064 + def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): + + class _YDL(YDL): 
+ def __init__(self, *args, **kwargs): + super(_YDL, self).__init__(*args, **kwargs) + + def trouble(self, s, tb=None): + pass + + ydl = _YDL({ + 'format': 'extra', + 'ignoreerrors': True, + }) + + class VideoIE(InfoExtractor): + _VALID_URL = r'video:(?P<id>\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [{ + 'format_id': 'default', + 'url': 'url:', + }] + if video_id == '0': + raise ExtractorError('foo') + if video_id == '2': + formats.append({ + 'format_id': 'extra', + 'url': TEST_URL, + }) + return { + 'id': video_id, + 'title': 'Video %s' % video_id, + 'formats': formats, + } + + class PlaylistIE(InfoExtractor): + _VALID_URL = r'playlist:' + + def _entries(self): + for n in range(3): + video_id = compat_str(n) + yield { + '_type': 'url_transparent', + 'ie_key': VideoIE.ie_key(), + 'id': video_id, + 'url': 'video:%s' % video_id, + 'title': 'Video Transparent %s' % video_id, + } + + def _real_extract(self, url): + return self.playlist_result(self._entries()) + + ydl.add_info_extractor(VideoIE(ydl)) + ydl.add_info_extractor(PlaylistIE(ydl)) + info = ydl.extract_info('playlist:') + entries = info['entries'] + self.assertEqual(len(entries), 3) + self.assertTrue(entries[0] is None) + self.assertTrue(entries[1] is None) + self.assertEqual(len(ydl.downloaded_info_dicts), 1) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(entries[2], downloaded) + self.assertEqual(downloaded['url'], TEST_URL) + self.assertEqual(downloaded['title'], 'Video Transparent 2') + self.assertEqual(downloaded['id'], '2') + self.assertEqual(downloaded['extractor'], 'Video') + self.assertEqual(downloaded['extractor_key'], 'Video') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 19370f62b..855a73157 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -793,21 +793,14 @@ class YoutubeDL(object): self.report_warning('The program functionality for this site has been 
marked as broken, ' 'and will probably not work.') + return self.__extract_info(url, ie, download, extra_info, process) + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def __handle_extraction_exceptions(func): + def wrapper(self, *args, **kwargs): try: - ie_result = ie.extract(url) - if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) - break - if isinstance(ie_result, list): - # Backwards compatibility: old IE result format - ie_result = { - '_type': 'compat_list', - 'entries': ie_result, - } - self.add_default_extra_info(ie_result, ie, url) - if process: - return self.process_ie_result(ie_result, download, extra_info) - else: - return ie_result + return func(self, *args, **kwargs) except GeoRestrictedError as e: msg = e.msg if e.countries: @@ -815,20 +808,33 @@ class YoutubeDL(object): map(ISO3166Utils.short2full, e.countries)) msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' 
self.report_error(msg) - break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) - break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - break else: raise + return wrapper + + @__handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process): + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + return ie_result def add_default_extra_info(self, ie_result, ie, url): self.add_extra_info(ie_result, { @@ -1003,9 +1009,8 @@ class YoutubeDL(object): self.to_screen('[download] ' + reason) continue - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? 
playlist_results.append(entry_result) ie_result['entries'] = playlist_results self.to_screen('[download] Finished downloading playlist: %s' % playlist) @@ -1034,6 +1039,11 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) + @__handle_extraction_exceptions + def __process_iterable_entry(self, entry, download, extra_info): + return self.process_ie_result( + entry, download=download, extra_info=extra_info) + def _build_format_filter(self, filter_spec): " Returns a function to filter the formats according to the filter_spec " From a7e0531999fa3a7fef542942aaad8c55e22adec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 04:22:21 +0700 Subject: [PATCH 65/87] [downloader/http] Fix crash during urlopen caused by missing reason of URLError --- youtube_dl/downloader/http.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 96379caf1..d8ac41dcc 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -109,7 +109,9 @@ class HttpFD(FileDownloader): try: ctx.data = self.ydl.urlopen(request) except (compat_urllib_error.URLError, ) as err: - if isinstance(err.reason, socket.timeout): + # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 + reason = getattr(err, 'reason', None) + if isinstance(reason, socket.timeout): raise RetryDownload(err) raise err # When trying to resume, Content-Range HTTP header of response has to be checked From 51ca93d75162f554efd8d4cb5ac97864a38f7bee Mon Sep 17 00:00:00 2001 From: renalid <renalid@gmail.com> Date: Fri, 20 Nov 2020 22:44:08 +0100 Subject: [PATCH 66/87] [francetv] Update to fix thumbnail URL issue (#27120) Fix the thumbnail URL. The issue was here for many years, never fixed. It's done ! 
:-) Example : https://www.france.tv/france-2/de-gaulle-l-eclat-et-le-secret/de-gaulle-l-eclat-et-le-secret-saison-1/2035247-solitude.html failed thumbnail url generated : http://pluzz.francetv.fr/staticftv/ref_emissions/2020-11-02/EMI_1104da66f533cc7dc5d0d07a181a18c2e2fe1d81_20201014122553940.jpg right thumbnail url fixed : https://sivideo.webservices.francetelevisions.fr/staticftv/ref_emissions/2020-11-02/EMI_1104da66f533cc7dc5d0d07a181a18c2e2fe1d81_20201014122553940.jpg --- youtube_dl/extractor/francetv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 92f0851e3..3ca415077 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -211,7 +211,7 @@ class FranceTVIE(InfoExtractor): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'description': clean_html(info.get('synopsis')), - 'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')), + 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')), 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, From a1c88c4819233cf5f3734bcd6cf251d4339196ce Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 20 Nov 2020 23:23:55 +0100 Subject: [PATCH 67/87] [infoq] fix format extraction(closes #25984) --- youtube_dl/extractor/infoq.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 18249cf9b..0a70a1fb4 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -54,7 +54,7 @@ class InfoQIE(BokeCCBaseIE): def _extract_rtmp_video(self, webpage): # The server URL is hardcoded - video_url = 'rtmpe://video.infoq.com/cfx/st/' + video_url = 'rtmpe://videof.infoq.com/cfx/st/' # 
Extract video URL encoded_id = self._search_regex( @@ -86,17 +86,18 @@ class InfoQIE(BokeCCBaseIE): return [{ 'format_id': 'http_video', 'url': http_video_url, + 'http_headers': {'Referer': 'https://www.infoq.com/'}, }] def _extract_http_audio(self, webpage, video_id): - fields = self._hidden_inputs(webpage) + fields = self._form_hidden_inputs('mp3Form', webpage) http_audio_url = fields.get('filename') if not http_audio_url: return [] # base URL is found in the Location header in the response returned by # GET https://www.infoq.com/mp3download.action?filename=... when logged in. - http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url) http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage)) # audio file seem to be missing some times even if there is a download link From e2096776b99e4b5b67aacd1bcc7807a3d3757236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 20:48:13 +0700 Subject: [PATCH 68/87] [youtube:tab] Add support for current video and fix lives extraction (closes #27126) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/youtube.py | 114 ++++++++++++++--------------- 2 files changed, 55 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 183050e07..5691c4cba 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1475,7 +1475,6 @@ from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, YoutubeHistoryIE, - YoutubeLiveIE, YoutubeTabIE, YoutubePlaylistIE, YoutubeRecommendedIE, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fd9e54c1f..484f79765 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2559,13 +2559,57 @@ class 
YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', + 'info_dict': { + 'id': '9Auq9mYxFEE', + 'ext': 'mp4', + 'title': 'Watch Sky News live', + 'uploader': 'Sky News', + 'uploader_id': 'skynews', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', + 'upload_date': '20191102', + 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662', + 'categories': ['News & Politics'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/TheYoungTurks/live', + 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if YoutubeLiveIE.suitable(url) else super( - YoutubeTabIE, cls).suitable(url) - def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( 'channelId', webpage, 'channel id', default=None) @@ -2951,7 +2995,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 
self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) identity_token = self._search_regex( - r'\bID_TOKEN["\']\s*:\s/l*["\'](.+?)["\']', webpage, + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( @@ -2962,7 +3006,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) if playlist: return self._extract_from_playlist(item_id, data, playlist) - # Fallback to video extraction if no playlist alike page is recognized + # Fallback to video extraction if no playlist alike page is recognized. + # First check for the current video then try the v attribute of URL query. + video_id = try_get( + data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], + compat_str) or video_id if video_id: return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) # Failed to recognize @@ -3083,58 +3131,6 @@ class YoutubeYtUserIE(InfoExtractor): ie=YoutubeTabIE.ie_key(), video_id=user_id) -class YoutubeLiveIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' - IE_NAME = 'youtube:live' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk 
Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - base_url = mobj.group('base_url') - webpage = self._download_webpage(url, channel_id, fatal=False) - if webpage: - page_type = self._og_search_property( - 'type', webpage, 'page type', default='') - video_id = self._html_search_meta( - 'videoId', webpage, 'video id', default=None) - if page_type.startswith('video') and video_id and re.match( - r'^[0-9A-Za-z_-]{11}$', video_id): - return self.url_result(video_id, YoutubeIE.ie_key()) - return self.url_result(base_url) - - class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for From 46a265a2da26c663463244ecf9a4a699c2cd6efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 20:49:56 +0700 Subject: [PATCH 69/87] [youtube] Fix like and dislike count extraction (closes #25977) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 484f79765..fb4c31326 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2196,8 +2196,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_count(count_name): return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), + 
(r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name), + r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)), video_webpage, count_name, default=None)) like_count = _extract_count('like') From 21292c0649e956afc46bd39d774ec811d568de2a Mon Sep 17 00:00:00 2001 From: Daniel Peukert <daniel@peukert.cc> Date: Sat, 21 Nov 2020 15:52:20 +0100 Subject: [PATCH 70/87] [youtube] Fix error reason extraction (#27081) --- youtube_dl/extractor/youtube.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fb4c31326..fb6d816cc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2023,6 +2023,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats.append(a_format) else: error_message = extract_unavailable_message() + if not error_message: + reason_list = try_get( + player_response, + lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], + list) or [] + for reason in reason_list: + if not isinstance(reason, dict): + continue + reason_text = try_get(reason, lambda x: x['text'], compat_str) + if reason_text: + if not error_message: + error_message = '' + error_message += reason_text + if error_message: + error_message = clean_html(error_message) if not error_message: error_message = clean_html(try_get( player_response, lambda x: x['playabilityStatus']['reason'], From 71ddc222adca21f471051e922d2c2f08a27b2d44 Mon Sep 17 00:00:00 2001 From: Josh Soref <jsoref@users.noreply.github.com> Date: Sat, 21 Nov 2020 10:00:05 -0500 Subject: [PATCH 71/87] Fix typos (#27084) * spelling: authorization Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: brightcove Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: creation Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: exceeded Signed-off-by: Josh 
Soref <jsoref@users.noreply.github.com> * spelling: exception Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: extension Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: extracting Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: extraction Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: frontline Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: improve Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: length Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: listsubtitles Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: multimedia Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: obfuscated Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: partitioning Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: playlist Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: playlists Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: restriction Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: services Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: split Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: srmediathek Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: support Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: thumbnail Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: verification Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> * spelling: whitespaces Signed-off-by: Josh Soref <jsoref@users.noreply.github.com> --- ChangeLog | 34 ++++++++++++++--------------- test/parameters.json | 2 +- youtube_dl/compat.py | 2 +- youtube_dl/extractor/brightcove.py | 8 +++---- youtube_dl/extractor/common.py | 2 +- youtube_dl/extractor/europa.py | 4 ++-- 
youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/kusi.py | 4 ++-- youtube_dl/extractor/npr.py | 2 +- youtube_dl/extractor/pbs.py | 2 +- youtube_dl/extractor/soundcloud.py | 2 +- youtube_dl/extractor/tagesschau.py | 2 +- youtube_dl/extractor/theplatform.py | 2 +- youtube_dl/extractor/turner.py | 6 ++--- youtube_dl/extractor/vimeo.py | 4 ++-- youtube_dl/extractor/xiami.py | 8 +++---- youtube_dl/utils.py | 16 +++++++------- 17 files changed, 51 insertions(+), 51 deletions(-) diff --git a/ChangeLog b/ChangeLog index 572e0360d..db5dd488a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -367,7 +367,7 @@ Extractors + Add support for more domains * [svt] Fix series extraction (#22297) * [svt] Fix article extraction (#22897, #22919) -* [soundcloud] Imporve private playlist/set tracks extraction (#3707) +* [soundcloud] Improve private playlist/set tracks extraction (#3707) version 2020.01.24 @@ -493,7 +493,7 @@ Extractors * [abcotvs] Relax URL regular expression and improve metadata extraction (#18014) * [channel9] Reduce response size -* [adobetv] Improve extaction +* [adobetv] Improve extraction * Use OnDemandPagedList for list extractors * Reduce show extraction requests * Extract original video format and subtitles @@ -518,7 +518,7 @@ Extractors * [dailymotion] Improve extraction * Extract http formats included in m3u8 manifest * Fix user extraction (#3553, #21415) - + Add suport for User Authentication (#11491) + + Add support for User Authentication (#11491) * Fix password protected videos extraction (#23176) * Respect age limit option and family filter cookie value (#18437) * Handle video url playlist query param @@ -603,7 +603,7 @@ Extractors - [go90] Remove extractor * [kakao] Remove raw request + [kakao] Extract format total bitrate -* [daum] Fix VOD and Clip extracton (#15015) +* [daum] Fix VOD and Clip extraction (#15015) * [kakao] Improve extraction + Add support for embed URLs + Add support for Kakao Legacy vid based embed URLs @@ -647,7 +647,7 @@ 
Extractors * Improve format extraction (#22123) + Extract uploader_id and uploader_url (#21916) + Extract all known thumbnails (#19071, #20659) - * Fix extration for private playlists (#20976) + * Fix extraction for private playlists (#20976) + Add support for playlist embeds (#20976) * Skip preview formats (#22806) * [dplay] Improve extraction @@ -1122,7 +1122,7 @@ Extractors * [hbo] Fix extraction and extract subtitles (#14629, #13709) * [youtube] Extract srv[1-3] subtitle formats (#20566) * [adultswim] Fix extraction (#18025) -* [teamcoco] Fix extraction and add suport for subdomains (#17099, #20339) +* [teamcoco] Fix extraction and add support for subdomains (#17099, #20339) * [adn] Fix subtitle compatibility with ffmpeg * [adn] Fix extraction and add support for positioning styles (#20549) * [vk] Use unique video id (#17848) @@ -1534,7 +1534,7 @@ version 2018.11.18 Extractors + [wwe] Extract subtitles -+ [wwe] Add support for playlistst (#14781) ++ [wwe] Add support for playlists (#14781) + [wwe] Add support for wwe.com (#14781, #17450) * [vk] Detect geo restriction (#17767) * [openload] Use original host during extraction (#18211) @@ -2567,7 +2567,7 @@ Extractors * [youku] Update ccode (#14872) * [mnet] Fix format extraction (#14883) + [xiami] Add Referer header to API request -* [mtv] Correct scc extention in extracted subtitles (#13730) +* [mtv] Correct scc extension in extracted subtitles (#13730) * [vvvvid] Fix extraction for kenc videos (#13406) + [br] Add support for BR Mediathek videos (#14560, #14788) + [daisuki] Add support for motto.daisuki.com (#14681) @@ -2588,7 +2588,7 @@ Extractors * [nexx] Extract more formats + [openload] Add support for openload.link (#14763) * [empflix] Relax URL regular expression -* [empflix] Fix extractrion +* [empflix] Fix extraction * [tnaflix] Don't modify download URLs (#14811) - [gamersyde] Remove extractor * [francetv:generationwhat] Fix extraction @@ -2783,7 +2783,7 @@ Extractors * [yahoo] Bypass geo restriction 
for brightcove (#14210) * [yahoo] Use extracted brightcove account id (#14210) * [rtve:alacarta] Fix extraction (#14290) -+ [yahoo] Add support for custom brigthcove embeds (#14210) ++ [yahoo] Add support for custom brightcove embeds (#14210) + [generic] Add support for Video.js embeds + [gfycat] Add support for /gifs/detail URLs (#14322) * [generic] Fix infinite recursion for twitter:player URLs (#14339) @@ -3028,7 +3028,7 @@ Extractors * [amcnetworks] Make rating optional (#12453) * [cloudy] Fix extraction (#13737) + [nickru] Add support for nickelodeon.ru -* [mtv] Improve thumbnal extraction +* [mtv] Improve thumbnail extraction * [nick] Automate geo-restriction bypass (#13711) * [niconico] Improve error reporting (#13696) @@ -3392,7 +3392,7 @@ Extractors + [cda] Support birthday verification (#12789) * [leeco] Fix extraction (#12974) + [pbs] Extract chapters -* [amp] Imporove thumbnail and subtitles extraction +* [amp] Improve thumbnail and subtitles extraction * [foxsports] Fix extraction (#12945) - [coub] Remove comment count extraction (#12941) @@ -3562,7 +3562,7 @@ Extractors + [rbmaradio] Add support for redbullradio.com URLs (#12687) + [npo:live] Add support for default URL (#12555) * [mixcloud:playlist] Fix title, description and view count extraction (#12582) -+ [thesun] Add suport for thesun.co.uk (#11298, #12674) ++ [thesun] Add support for thesun.co.uk (#11298, #12674) + [ceskateleveize:porady] Add support for porady (#7411, #12645) * [ceskateleveize] Improve extraction and remove URL replacement hacks + [kaltura] Add support for iframe embeds (#12679) @@ -3601,7 +3601,7 @@ Extractors * [funimation] Fix extraction (#10696, #11773) + [xfileshare] Add support for vidabc.com (#12589) + [xfileshare] Improve extraction and extract hls formats -+ [crunchyroll] Pass geo verifcation proxy ++ [crunchyroll] Pass geo verification proxy + [cwtv] Extract ISM formats + [tvplay] Bypass geo restriction + [vrv] Add support for vrv.co @@ -3665,7 +3665,7 @@ Extractors 
+ [bostonglobe] Add extractor for bostonglobe.com (#12099) + [toongoggles] Add support for toongoggles.com (#12171) + [medialaan] Add support for Medialaan sites (#9974, #11912) -+ [discoverynetworks] Add support for more domains and bypass geo restiction ++ [discoverynetworks] Add support for more domains and bypass geo restriction * [openload] Fix extraction (#10408) @@ -5255,7 +5255,7 @@ version 2016.07.09.1 Fixed/improved extractors - youtube - ard -- srmediatek (#9373) +- srmediathek (#9373) version 2016.07.09 @@ -5319,7 +5319,7 @@ Fixed/improved extractors - kaltura (#5557) - la7 - Changed features -- Rename --cn-verfication-proxy to --geo-verification-proxy +- Rename --cn-verification-proxy to --geo-verification-proxy Miscellaneous - Add script for displaying downloads statistics diff --git a/test/parameters.json b/test/parameters.json index 7bf59c25f..65fd54428 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -37,7 +37,7 @@ "writeinfojson": true, "writesubtitles": false, "allsubtitles": false, - "listssubtitles": false, + "listsubtitles": false, "socket_timeout": 20, "fixup": "never" } diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0ee9bc760..6c3d49d45 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2345,7 +2345,7 @@ except ImportError: # Python <3.4 # HTMLParseError has been deprecated in Python 3.3 and removed in # Python 3.5. 
Introducing dummy exception for Python >3.5 for compatible - # and uniform cross-version exceptiong handling + # and uniform cross-version exception handling class compat_HTMLParseError(Exception): pass diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 2aa9f4782..000eac71c 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -147,7 +147,7 @@ class BrightcoveLegacyIE(InfoExtractor): ] @classmethod - def _build_brighcove_url(cls, object_str): + def _build_brightcove_url(cls, object_str): """ Build a Brightcove url from a xml string containing <object class="BrightcoveExperience">{params}</object> @@ -217,7 +217,7 @@ class BrightcoveLegacyIE(InfoExtractor): return cls._make_brightcove_url(params) @classmethod - def _build_brighcove_url_from_js(cls, object_js): + def _build_brightcove_url_from_js(cls, object_js): # The layout of JS is as follows: # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { # // build Brightcove <object /> XML @@ -272,12 +272,12 @@ class BrightcoveLegacyIE(InfoExtractor): ).+?>\s*</object>''', webpage) if matches: - return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + return list(filter(None, [cls._build_brightcove_url(m) for m in matches])) matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) if matches: return list(filter(None, [ - cls._build_brighcove_url_from_js(custom_bc) + cls._build_brightcove_url_from_js(custom_bc) for custom_bc in matches])) return [src for _, src in re.findall( r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0c9089674..1a08c7616 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1664,7 +1664,7 @@ class InfoExtractor(object): # just the media without qualities renditions. 
# Fortunately, master playlist can be easily distinguished from media # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] - # master playlist tags MUST NOT appear in a media playist and vice versa. + # master playlist tags MUST NOT appear in a media playlist and vice versa. # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every # media playlist and MUST NOT appear in master playlist thus we can # clearly detect media playlist with this criterion. diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py index 1efc0b2ec..2c1c747a1 100644 --- a/youtube_dl/extractor/europa.py +++ b/youtube_dl/extractor/europa.py @@ -60,7 +60,7 @@ class EuropaIE(InfoExtractor): title = get_item('title', preferred_langs) or video_id description = get_item('description', preferred_langs) - thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') + thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) @@ -85,7 +85,7 @@ class EuropaIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnmail, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d08a8cca5..f10f11244 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -842,7 +842,7 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - # MTVSercices embed + # MTVServices embed { 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html', 'md5': 'ca1aef97695ef2c1d6973256a57e5252', diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py index 6a7e3baa7..9833d35eb 100644 --- 
a/youtube_dl/extractor/kusi.py +++ b/youtube_dl/extractor/kusi.py @@ -64,7 +64,7 @@ class KUSIIE(InfoExtractor): duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) description = xpath_text(doc, 'ABSTRACT') thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') - createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) + creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') formats = [] @@ -84,5 +84,5 @@ class KUSIIE(InfoExtractor): 'duration': duration, 'formats': formats, 'thumbnail': thumbnail, - 'timestamp': createtion_time, + 'timestamp': creation_time, } diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 53acc6e57..9d1122f0c 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -33,7 +33,7 @@ class NprIE(InfoExtractor): }, }], }, { - # mutlimedia, not media title + # multimedia, not media title 'url': 'https://www.npr.org/2017/06/19/533198237/tigers-jaw-tiny-desk-concert', 'info_dict': { 'id': '533198237', diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 4dbe661be..d4baa16ee 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -477,7 +477,7 @@ class PBSIE(InfoExtractor): if media_id: return media_id, presumptive_id, upload_date, description - # Fronline video embedded via flp + # Frontline video embedded via flp video_id = self._search_regex( r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) if video_id: diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index a2fddf6d9..abb85e1e5 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -558,7 +558,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): def _extract_playlist(self, base_url, playlist_id, 
playlist_title): - # Per the SoundCloud documentation, the maximum limit for a linked partioning query is 200. + # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200. # https://developers.soundcloud.com/blog/offset-pagination-deprecated COMMON_QUERY = { 'limit': 200, diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index c351b7545..8ceab7e35 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -86,7 +86,7 @@ class TagesschauPlayerIE(InfoExtractor): # return self._extract_via_api(kind, video_id) # JSON api does not provide some audio formats (e.g. ogg) thus - # extractiong audio via webpage + # extracting audio via webpage webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 07055513a..41bfbe80f 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -208,7 +208,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): if m: return [m.group('url')] - # Are whitesapces ignored in URLs? + # Are whitespaces ignored in URLs? 
# https://github.com/ytdl-org/youtube-dl/issues/12044 matches = re.findall( r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 4a6cbfbb8..2964504a2 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -56,9 +56,9 @@ class TurnerBaseIE(AdobePassIE): content_id = xpath_text(video_data, 'contentId') or video_id # rtmp_src = xpath_text(video_data, 'akamai/src') # if rtmp_src: - # splited_rtmp_src = rtmp_src.split(',') - # if len(splited_rtmp_src) == 2: - # rtmp_src = splited_rtmp_src[1] + # split_rtmp_src = rtmp_src.split(',') + # if len(split_rtmp_src) == 2: + # rtmp_src = split_rtmp_src[1] # aifp = xpath_text(video_data, 'akamai/aifp', default='') urls = [] diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cfd04d50c..4c55946f1 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -922,7 +922,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): }] _PAGE_SIZE = 100 - def _fetch_page(self, album_id, authorizaion, hashed_pass, page): + def _fetch_page(self, album_id, authorization, hashed_pass, page): api_page = page + 1 query = { 'fields': 'link,uri', @@ -934,7 +934,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): videos = self._download_json( 'https://api.vimeo.com/albums/%s/videos' % album_id, album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorizaion, + 'Authorization': 'jwt ' + authorization, })['data'] for video in videos: link = video.get('link') diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index 618da8382..769aab331 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -54,17 +54,17 @@ class XiamiBaseIE(InfoExtractor): def _decrypt(origin): n = int(origin[0]) origin = origin[1:] - short_lenth = len(origin) // n - long_num = len(origin) - 
short_lenth * n + short_length = len(origin) // n + long_num = len(origin) - short_length * n l = tuple() for i in range(0, n): - length = short_lenth + length = short_length if i < long_num: length += 1 l += (origin[0:length], ) origin = origin[length:] ans = '' - for i in range(0, short_lenth + 1): + for i in range(0, short_length + 1): for j in range(0, n): if len(l[j]) > i: ans += l[j][i] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 321f903ab..8cefafd79 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2458,7 +2458,7 @@ class XAttrMetadataError(YoutubeDLError): # Parsing code and msg if (self.code in (errno.ENOSPC, errno.EDQUOT) - or 'No space left' in self.msg or 'Disk quota excedded' in self.msg): + or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg): self.reason = 'NO_SPACE' elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: self.reason = 'VALUE_TOO_LONG' @@ -4207,10 +4207,10 @@ def parse_codecs(codecs_str): # http://tools.ietf.org/html/rfc6381 if not codecs_str: return {} - splited_codecs = list(filter(None, map( + split_codecs = list(filter(None, map( lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) vcodec, acodec = None, None - for full_codec in splited_codecs: + for full_codec in split_codecs: codec = full_codec.split('.')[0] if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'): if not vcodec: @@ -4221,10 +4221,10 @@ def parse_codecs(codecs_str): else: write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) if not vcodec and not acodec: - if len(splited_codecs) == 2: + if len(split_codecs) == 2: return { - 'vcodec': splited_codecs[0], - 'acodec': splited_codecs[1], + 'vcodec': split_codecs[0], + 'acodec': split_codecs[1], } else: return { @@ -5463,7 +5463,7 @@ def encode_base_n(num, n, table=None): def decode_packed_codes(code): mobj = re.search(PACKED_CODES_RE, code) - obfucasted_code, 
base, count, symbols = mobj.groups() + obfuscated_code, base, count, symbols = mobj.groups() base = int(base) count = int(count) symbols = symbols.split('|') @@ -5476,7 +5476,7 @@ def decode_packed_codes(code): return re.sub( r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], - obfucasted_code) + obfuscated_code) def caesar(s, alphabet, shift): From 0ada1b90b801e5d6ce713a25eccd7de21e7e4b6f Mon Sep 17 00:00:00 2001 From: Mattias Wadman <mattias.wadman@gmail.com> Date: Sat, 21 Nov 2020 17:24:37 +0100 Subject: [PATCH 72/87] [svt] Extract timestamp and thumbnail in more cases (#27130) Add timestamp, set to "valid from", which I think could be seen as the publish time. Add thumbnail in more cases; it seems this was only done in the embedded data case for some reason. Switch svtplay test url to an existing video and also one with no expiry date. Also add an additional thumbnail url test regex. --- youtube_dl/extractor/svt.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 2f6887d86..9dd91c69a 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + unified_timestamp, str_or_none, strip_or_none, try_get, @@ -44,7 +45,8 @@ class SVTBaseIE(InfoExtractor): 'format_id': player_type, 'url': vurl, }) - if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + rights = try_get(video_info, lambda x: x['rights'], dict) or {} + if not formats and rights.get('geoBlockedSweden'): self.raise_geo_restricted( 'This video is only available in Sweden', countries=self._GEO_COUNTRIES) @@ -70,6 +72,7 @@ class SVTBaseIE(InfoExtractor): episode = video_info.get('episodeTitle') episode_number = int_or_none(video_info.get('episodeNumber')) + timestamp = unified_timestamp(rights.get('validFrom')) duration = int_or_none(dict_get(video_info, ('materialLength', 
'contentDuration'))) age_limit = None adult = dict_get( @@ -84,6 +87,7 @@ class SVTBaseIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'duration': duration, + 'timestamp': timestamp, 'age_limit': age_limit, 'series': series, 'season_number': season_number, @@ -141,21 +145,30 @@ class SVTPlayIE(SVTPlayBaseIE): ) ''' _TESTS = [{ - 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', - 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', + 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': '5996901', + 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Flygplan till Haile Selassie', - 'duration': 3527, - 'thumbnail': r're:^https?://.*[\.-]jpg$', + 'title': 'Det h\xe4r \xe4r himlen', + 'timestamp': 1586044800, + 'upload_date': '20200405', + 'duration': 3515, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { 'sv': [{ - 'ext': 'wsrt', + 'ext': 'vtt', }] }, }, + 'params': { + 'format': 'bestvideo', + # skip for now due to download test asserts that segment is > 10000 bytes and svt uses + # init segments that are smaller + # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B + 'skip_download': True, + }, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -236,7 +249,10 @@ class SVTPlayIE(SVTPlayBaseIE): r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), webpage, 'video id') - return self._extract_by_video_id(svt_id, webpage) + info_dict = self._extract_by_video_id(svt_id, webpage) + info_dict['thumbnail'] = thumbnail + + return info_dict class SVTSeriesIE(SVTPlayBaseIE): From 049f2242480fd481ddb6a88c5a9d7f360f890d7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 23:35:01 +0700 Subject: [PATCH 73/87] [svtplay] Add support for svt.se/barnkanalen 
(closes #24817) --- youtube_dl/extractor/svt.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 9dd91c69a..17bd4acdc 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -140,7 +140,11 @@ class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'''(?x) (?: - svt:(?P<svt_id>[^/?#&]+)| + (?: + svt:| + https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ + ) + (?P<svt_id>[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) ) ''' @@ -185,6 +189,12 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'svt:14278044', 'only_matching': True, + }, { + 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', + 'only_matching': True, + }, { + 'url': 'svt:eWv5MLX', + 'only_matching': True, }] def _adjust_title(self, info): @@ -376,7 +386,7 @@ class SVTPageIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): path, display_id = re.match(self._VALID_URL, url).groups() From 1e72660c9b2bcd18c688cd8786f81acaa7ad088e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 23:36:25 +0700 Subject: [PATCH 74/87] [svtplay] Fix test title --- youtube_dl/extractor/svt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 17bd4acdc..a0b6ef4db 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -154,7 +154,7 @@ class SVTPlayIE(SVTPlayBaseIE): 'info_dict': { 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Det h\xe4r \xe4r himlen', + 'title': 'Det här är himlen', 'timestamp': 1586044800, 'upload_date': '20200405', 'duration': 3515, From 
82abc13aedb42d77afcb0ca9c1d7982955826b66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 23:41:49 +0700 Subject: [PATCH 75/87] [youtube:tab] Comment out test --- youtube_dl/extractor/youtube.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fb6d816cc..2aad855c8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2620,10 +2620,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }] + }, + # TODO + # { + # 'url': 'https://www.youtube.com/TheYoungTurks/live', + # 'only_matching': True, + # } + ] def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( From 5d8cb4367dc5f8c5739bfa4e807d2a8b301dfe4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 23:54:33 +0700 Subject: [PATCH 76/87] release 2020.11.21 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index e2e5a15ec..1601ee725 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. 
Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.19** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.19 + [debug] youtube-dl version 2020.11.21 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 880a96835..1e8bc7d82 100644 --- 
a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.19** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 25c5a7daf..611d7cab9 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
- Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.19** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 59716f962..aaef901ad 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.19** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.19 + [debug] youtube-dl version 2020.11.21 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 410abee90..86c2f1e32 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.19. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.19** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 37425e7a1..53dbc28e5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -41,6 +41,7 @@ - **AlJazeera** - **Allocine** - **AlphaPorno** + - **Amara** - **AMCNetworks** - **AmericasTestKitchen** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl @@ -1130,7 +1131,6 @@ - **YourUpload** - **youtube**: YouTube.com - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - **youtube:search**: YouTube.com searches diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 53eeb6ccf..9ede0334a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.19' +__version__ = '2020.11.21' From 650bd8f6231b3302dadcddf336748a9eb1dc85ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 23:52:39 +0700 Subject: [PATCH 77/87] [ChangeLog] Actualize [ci skip] --- ChangeLog | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/ChangeLog b/ChangeLog index db5dd488a..a108e8734 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,34 @@ +version <unreleased> + +Core +* [downloader/http] Fix crash during urlopen caused by missing reason + of URLError +* 
[YoutubeDL] Fix --ignore-errors for playlists with generator-based entries + of url_transparent (#27064) + +Extractors ++ [svtplay] Add support for svt.se/barnkanalen (#24817) ++ [svt] Extract timestamp (#27130) +* [svtplay] Improve thumbnail extraction (#27130) +* [youtube] Fix error reason extraction (#27081) +* [youtube] Fix like and dislike count extraction (#25977) ++ [youtube:tab] Add support for current video and fix lives extraction (#27126) +* [infoq] Fix format extraction (#25984) +* [francetv] Update to fix thumbnail URL issue (#27120) +* [youtube] Improve yt initial data extraction (#27093) ++ [discoverynetworks] Add support new TLC/DMAX URLs (#27100) +* [rai] Fix protocol relative relinker URLs (#22766) +* [rai] Fix unavailable video format detection +* [rai] Improve extraction +* [rai] Fix extraction (#27077) +* [viki] Improve format extraction +* [viki] Fix stream extraction from MPD (#27092) +* [googledrive] Fix format extraction (#26979) ++ [amara] Add support for amara.org (#20618) +* [vimeo:album] Fix extraction (#27079) +* [mtv] Fix mgid extraction (#26841) + + version 2020.11.19 Core From f23eceebbfa0aa26f7ff598026ee6029233148b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Nov 2020 23:59:11 +0700 Subject: [PATCH 78/87] release 2020.11.21.1 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 1601ee725..e2cbaee1c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check 
list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.21** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.21 + [debug] youtube-dl version 2020.11.21.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git 
a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 1e8bc7d82..e531e89c0 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.21** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 611d7cab9..686878c56 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
- Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.21** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index aaef901ad..17b853716 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.21** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.21 + [debug] youtube-dl version 2020.11.21.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 86c2f1e32..109c47925 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.11.21.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.21** +- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index a108e8734..5a08b3a66 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.11.21.1 Core * [downloader/http] Fix crash during urlopen caused by missing reason diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9ede0334a..d7e901521 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.21' +__version__ = '2020.11.21.1' From c4cabf040e1bf37aec69a3ff45594ac6965ba139 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 22 Nov 2020 05:04:01 +0700 Subject: [PATCH 79/87] [pinterest] Add extractor (closes #25747) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/pinterest.py | 176 +++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 youtube_dl/extractor/pinterest.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5691c4cba..ba11f12b9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -844,6 +844,10 @@ from .picarto import ( ) from .piksel import PikselIE from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) from .pladform import PladformIE from .platzi import ( PlatziIE, diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py new file mode 100644 index 000000000..2bb4ca660 --- /dev/null +++ 
b/youtube_dl/extractor/pinterest.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + + def _extract_resource(self, webpage, video_id): + return self._parse_json( + self._search_regex( + r'<script[^>]+\bid=["\']initial-state["\'][^>]*>({.+?})</script>', + webpage, 'application json'), + video_id)['resourceResponses'] + + def _extract_video(self, data, extract_formats=True): + video_id = data['id'] + + title = (data.get('title') or data.get('grid_title') or video_id).strip() + + formats = [] + duration = None + if extract_formats: + for format_id, format_dict in data['videos']['video_list'].items(): + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('url')) + if not format_url: + continue + duration = float_or_none(format_dict.get('duration'), scale=1000) + ext = determine_ext(format_url) + if 'hls' in format_id.lower() or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'width': int_or_none(format_dict.get('width')), + 'height': int_or_none(format_dict.get('height')), + 'duration': duration, + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + description = data.get('description') or data.get('description_html') or data.get('seo_description') + timestamp = 
unified_timestamp(data.get('created_at')) + + def _u(field): + return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) + + uploader = _u('full_name') + uploader_id = _u('id') + + repost_count = int_or_none(data.get('repin_count')) + comment_count = int_or_none(data.get('comment_count')) + categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) + tags = data.get('hashtags') + + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, + 'formats': formats, + 'extractor_key': PinterestIE.ie_key(), + } + + +class PinterestIE(PinterestBaseIE): + _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.com/pin/664281013778109217/', + 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', + 'info_dict': { + 'id': '664281013778109217', + 'ext': 'mp4', + 'title': 'Origami', + 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'duration': 57.7, + 'timestamp': 1593073622, + 'upload_date': '20200625', + 'uploader': 'Love origami -I am Dafei', + 'uploader_id': '586523688879454212', + 'repost_count': 50, + 'comment_count': 0, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://co.pinterest.com/pin/824721750502199491/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = 
self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._extract_resource(webpage, video_id)[0]['response']['data'] + return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): + _VALID_URL = r'%s/[^/]+/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', + 'info_dict': { + 'id': '585890301462791043', + 'title': 'cool diys', + }, + 'playlist_count': 8, + }] + + @classmethod + def suitable(cls, url): + return False if PinterestIE.suitable(url) else super( + PinterestCollectionIE, cls).suitable(url) + + def _real_extract(self, url): + collection_name = self._match_id(url) + webpage = self._download_webpage(url, collection_name) + resource = self._extract_resource(webpage, collection_name)[1] + entries = [] + for item in resource['response']['data']: + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + title = try_get( + resource, lambda x: x['options']['board_title'], compat_str) + collection_id = try_get( + resource, lambda x: x['options']['board_id'], + compat_str) or collection_name + return self.playlist_result( + entries, playlist_id=collection_id, playlist_title=title) From 193422e12a98ebcc49a215cf3667c7fce593f25c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 22 Nov 2020 12:54:55 +0100 Subject: [PATCH 80/87] [extractor/common] add generic support for akamai http format extraction --- youtube_dl/extractor/common.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/youtube_dl/extractor/common.py 
b/youtube_dl/extractor/common.py index 1a08c7616..16aff885c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2596,6 +2596,7 @@ class InfoExtractor(object): def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] + hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') @@ -2608,6 +2609,7 @@ class InfoExtractor(object): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') hls_host = hosts.get('hls') if hls_host: @@ -2615,6 +2617,31 @@ class InfoExtractor(object): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + http_host = hosts.get('http') + if http_host and 'hdnea=' not in manifest_url: + REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' + qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') + qualities_length = len(qualities) + if len(formats) in (qualities_length + 1, qualities_length * 2 + 1): + i = 0 + http_formats = [] + for f in formats: + if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': + for protocol in ('http', 'https'): + http_f = f.copy() + del http_f['manifest_url'] + http_url = re.sub( + REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url']) + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), + 'url': http_url, + 'protocol': protocol, + }) + http_formats.append(http_f) + i += 1 + formats.extend(http_formats) + return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): From e9cbb98a0f63e08cf7c42d1612450e2534c8de7e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 22 Nov 2020 
13:01:56 +0100 Subject: [PATCH 81/87] [skyit] add support for multiple Sky Italia websites(closes #26629) --- youtube_dl/extractor/extractors.py | 10 ++ youtube_dl/extractor/skyit.py | 239 +++++++++++++++++++++++++++++ 2 files changed, 249 insertions(+) create mode 100644 youtube_dl/extractor/skyit.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ba11f12b9..356f4cc6b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1013,6 +1013,16 @@ from .shared import ( from .showroomlive import ShowRoomLiveIE from .sina import SinaIE from .sixplay import SixPlayIE +from .skyit import ( + SkyItPlayerIE, + SkyItVideoIE, + SkyItVideoLiveIE, + SkyItIE, + SkyItAcademyIE, + SkyItArteIE, + CieloTVItIE, + TV8ItIE, +) from .skylinewebcams import SkylineWebcamsIE from .skynewsarabia import ( SkyNewsArabiaIE, diff --git a/youtube_dl/extractor/skyit.py b/youtube_dl/extractor/skyit.py new file mode 100644 index 000000000..14a4d8d4c --- /dev/null +++ b/youtube_dl/extractor/skyit.py @@ -0,0 +1,239 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + dict_get, + int_or_none, + parse_duration, + unified_timestamp, +) + + +class SkyItPlayerIE(InfoExtractor): + IE_NAME = 'player.sky.it' + _VALID_URL = r'https?://player\.sky\.it/player/(?:external|social)\.html\?.*?\bid=(?P<id>\d+)' + _GEO_BYPASS = False + _DOMAIN = 'sky' + _PLAYER_TMPL = 'https://player.sky.it/player/external.html?id=%s&domain=%s' + # http://static.sky.it/static/skyplayer/conf.json + _TOKEN_MAP = { + 'cielo': 'Hh9O7M8ks5yi6nSROL7bKYz933rdf3GhwZlTLMgvy4Q', + 'hotclub': 'kW020K2jq2lk2eKRJD2vWEg832ncx2EivZlTLQput2C', + 'mtv8': 'A5Nn9GGb326CI7vP5e27d7E4PIaQjota', + 'salesforce': 'C6D585FD1615272C98DE38235F38BD86', + 'sitocommerciale': 'VJwfFuSGnLKnd9Phe9y96WkXgYDCguPMJ2dLhGMb2RE', + 
'sky': 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk', + 'skyacademy': 'A6LAn7EkO2Q26FRy0IAMBekX6jzDXYL3', + 'skyarte': 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd', + 'theupfront': 'PRSGmDMsg6QMGc04Obpoy7Vsbn7i2Whp', + } + + def _player_url_result(self, video_id): + return self.url_result( + self._PLAYER_TMPL % (video_id, self._DOMAIN), + SkyItPlayerIE.ie_key(), video_id) + + def _parse_video(self, video, video_id): + title = video['title'] + is_live = video.get('type') == 'live' + hls_url = video.get(('streaming' if is_live else 'hls') + '_url') + if not hls_url and video.get('geoblock' if is_live else 'geob'): + self.raise_geo_restricted(countries=['IT']) + + if is_live: + formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4') + else: + formats = self._extract_akamai_formats( + hls_url, video_id, {'http': 'videoplatform.sky.it'}) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')), + 'description': video.get('short_desc') or None, + 'timestamp': unified_timestamp(video.get('create_date')), + 'duration': int_or_none(video.get('duration_sec')) or parse_duration(video.get('duration')), + 'is_live': is_live, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + domain = compat_parse_qs(compat_urllib_parse_urlparse( + url).query).get('domain', [None])[0] + token = dict_get(self._TOKEN_MAP, (domain, 'sky')) + video = self._download_json( + 'https://apid.sky.it/vdp/v1/getVideoData', + video_id, query={ + 'caller': 'sky', + 'id': video_id, + 'token': token + }, headers=self.geo_verification_headers()) + return self._parse_video(video, video_id) + + +class SkyItVideoIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it' + _VALID_URL = r'https?://(?:masterchef|video|xfactor)\.sky\.it(?:/[^/]+)*/video/[0-9a-z-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 
'https://video.sky.it/news/mondo/video/uomo-ucciso-da-uno-squalo-in-australia-631227', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + } + }, { + 'url': 'https://xfactor.sky.it/video/x-factor-2020-replay-audizioni-1-615820', + 'only_matching': True, + }, { + 'url': 'https://masterchef.sky.it/video/masterchef-9-cosa-e-successo-nella-prima-puntata-562831', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._player_url_result(video_id) + + +class SkyItVideoLiveIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it:live' + _VALID_URL = r'https?://video\.sky\.it/diretta/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://video.sky.it/diretta/tg24', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': r're:Diretta TG24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'description': 'Guarda la diretta streaming di SkyTg24, segui con Sky tutti gli appuntamenti e gli speciali di Tg24.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + asset_id = compat_str(self._parse_json(self._search_regex( + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id']) + livestream = self._download_json( + 'https://apid.sky.it/vdp/v1/getLivestream', + asset_id, query={'id': asset_id}) + return self._parse_video(livestream, asset_id) + + +class SkyItIE(SkyItPlayerIE): + IE_NAME = 'sky.it' + _VALID_URL = r'https?://(?:sport|tg24)\.sky\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://sport.sky.it/calcio/serie-a/2020/11/21/juventus-cagliari-risultato-gol', + 'info_dict': { + 'id': '631201', + 'ext': 'mp4', + 'title': 'Un rosso alla 
violenza: in campo per i diritti delle donne', + 'upload_date': '20201121', + 'timestamp': 1605995753, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://tg24.sky.it/mondo/2020/11/22/australia-squalo-uccide-uomo', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + }, + }] + _VIDEO_ID_REGEX = r'data-videoid="(\d+)"' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + self._VIDEO_ID_REGEX, webpage, 'video id') + return self._player_url_result(video_id) + + +class SkyItAcademyIE(SkyItIE): + IE_NAME = 'skyacademy.it' + _VALID_URL = r'https?://(?:www\.)?skyacademy\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.skyacademy.it/eventi-speciali/2019/07/05/a-lezione-di-cinema-con-sky-academy-/', + 'md5': 'ced5c26638b7863190cbc44dd6f6ba08', + 'info_dict': { + 'id': '523458', + 'ext': 'mp4', + 'title': 'Sky Academy "The Best CineCamp 2019"', + 'timestamp': 1562843784, + 'upload_date': '20190711', + } + }] + _DOMAIN = 'skyacademy' + _VIDEO_ID_REGEX = r'id="news-videoId_(\d+)"' + + +class SkyItArteIE(SkyItIE): + IE_NAME = 'arte.sky.it' + _VALID_URL = r'https?://arte\.sky\.it/video/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://arte.sky.it/video/serie-musei-venezia-collezionismo-12-novembre/', + 'md5': '515aee97b87d7a018b6c80727d3e7e17', + 'info_dict': { + 'id': '627926', + 'ext': 'mp4', + 'title': "Musei Galleria Franchetti alla Ca' d'Oro Palazzo Grimani", + 'upload_date': '20201106', + 'timestamp': 1604664493, + } + }] + _DOMAIN = 'skyarte' + _VIDEO_ID_REGEX = r'(?s)<iframe[^>]+src="(?:https:)?//player\.sky\.it/player/external\.html\?[^"]*\bid=(\d+)' + + +class CieloTVItIE(SkyItIE): + IE_NAME = 'cielotv.it' + _VALID_URL = 
r'https?://(?:www\.)?cielotv\.it/video/(?P<id>[^.]+)\.html' + _TESTS = [{ + 'url': 'https://www.cielotv.it/video/Il-lunedi-e-sempre-un-dramma.html', + 'md5': 'c4deed77552ba901c2a0d9258320304b', + 'info_dict': { + 'id': '499240', + 'ext': 'mp4', + 'title': 'Il lunedì è sempre un dramma', + 'upload_date': '20190329', + 'timestamp': 1553862178, + } + }] + _DOMAIN = 'cielo' + _VIDEO_ID_REGEX = r'videoId\s*=\s*"(\d+)"' + + +class TV8ItIE(SkyItVideoIE): + IE_NAME = 'tv8.it' + _VALID_URL = r'https?://tv8\.it/showvideo/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://tv8.it/showvideo/630529/ogni-mattina-ucciso-asino-di-andrea-lo-cicero/18-11-2020/', + 'md5': '9ab906a3f75ea342ed928442f9dabd21', + 'info_dict': { + 'id': '630529', + 'ext': 'mp4', + 'title': 'Ogni mattina - Ucciso asino di Andrea Lo Cicero', + 'timestamp': 1605721374, + 'upload_date': '20201118', + } + }] + _DOMAIN = 'mtv8' From 9d531aa2918067570e4827fcced59c60accac220 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 22 Nov 2020 13:07:04 +0100 Subject: [PATCH 82/87] [rumble] add support for embed pages(#10785) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rumble.py | 67 ++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 youtube_dl/extractor/rumble.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 356f4cc6b..31fb4c95a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -963,6 +963,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE +from .rumble import RumbleEmbedIE from .rutube import ( RutubeIE, RutubeChannelIE, diff --git a/youtube_dl/extractor/rumble.py b/youtube_dl/extractor/rumble.py new file mode 100644 index 000000000..4a0225109 --- /dev/null +++ b/youtube_dl/extractor/rumble.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import 
unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + try_get, +) + + +class RumbleEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + 'https://rumble.com/embedJS/', video_id, + query={'request': 'video', 'v': video_id}) + title = video['title'] + + formats = [] + for height, ua in (video.get('ua') or {}).items(): + for i in range(2): + f_url = try_get(ua, lambda x: x[i], compat_str) + if f_url: + ext = determine_ext(f_url) + f = { + 'ext': ext, + 'format_id': '%s-%sp' % (ext, height), + 'height': int_or_none(height), + 'url': f_url, + } + bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) + if bitrate: + f['tbr'] = int_or_none(bitrate) + formats.append(f) + self._sort_formats(formats) + + author = video.get('author') or {} + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('i'), + 'timestamp': parse_iso8601(video.get('pubDate')), + 'channel': author.get('name'), + 'channel_url': author.get('url'), + 'duration': int_or_none(video.get('duration')), + } From cb6e24f946023e04469acb00174dfd71c2fa518d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 22 Nov 2020 13:16:03 +0100 Subject: [PATCH 83/87] [lbry] relax _VALID_URL regex(closes #27144) --- youtube_dl/extractor/lbry.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lbry.py 
b/youtube_dl/extractor/lbry.py index 0a7ee919c..6177297ab 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -16,7 +16,7 @@ from ..utils import ( class LBRYIE(InfoExtractor): IE_NAME = 'lbry.tv' - _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])' + _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[^:]+:[0-9a-z]+/[^:]+:[0-9a-z])' _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', @@ -44,6 +44,9 @@ class LBRYIE(InfoExtractor): }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, + }, { + 'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b", + 'only_matching': True, }] def _call_api_proxy(self, method, display_id, params): From 15f27347911e51954184aa483a77c98eaea2c399 Mon Sep 17 00:00:00 2001 From: Jia Rong Yee <28086837+fourjr@users.noreply.github.com> Date: Sun, 22 Nov 2020 21:12:47 +0800 Subject: [PATCH 84/87] [nytimes] Add new cooking.nytimes.com extractor (#27143) * [nytimes] support cooking.nytimes.com, resolves #27112 Co-authored-by: remitamine <remitamine@gmail.com> --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nytimes.py | 38 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 31fb4c95a..fb18a0563 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -782,6 +782,7 @@ from .ntvru import NTVRuIE from .nytimes import ( NYTimesIE, NYTimesArticleIE, + NYTimesCookingIE, ) from .nuvid import NuvidIE from .nzz import NZZIE diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index fc78ca56c..976b1c694 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -221,3 +221,41 @@ class NYTimesArticleIE(NYTimesBaseIE): 
r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), webpage, 'podcast data') return self._extract_podcast_from_json(podcast_data, page_id, webpage) + + +class NYTimesCookingIE(NYTimesBaseIE): + _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', + 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'info_dict': { + 'id': '100000004756089', + 'ext': 'mov', + 'timestamp': 1479383008, + 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', + 'title': 'Cranberry Tart', + 'upload_date': '20161117', + 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + }, + }, { + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', + 'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'info_dict': { + 'id': '100000003951728', + 'ext': 'mov', + 'timestamp': 1445509539, + 'description': 'Turkey guide', + 'upload_date': '20151022', + 'title': 'Turkey', + } + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + video_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'video id') + + return self._extract_video_from_id(video_id) From c84f9475b8df0f892b631966159e1649dafe13f0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 22 Nov 2020 17:39:41 +0100 Subject: [PATCH 85/87] [box] Add new extractor(#5949) --- youtube_dl/extractor/box.py | 98 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 99 insertions(+) create mode 100644 youtube_dl/extractor/box.py diff --git a/youtube_dl/extractor/box.py b/youtube_dl/extractor/box.py new file mode 100644 index 000000000..aae82d1af --- /dev/null +++ b/youtube_dl/extractor/box.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor 
+from ..utils import ( + determine_ext, + parse_iso8601, + # try_get, + update_url_query, +) + + +class BoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' + _TEST = { + 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', + 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', + 'info_dict': { + 'id': '510727257538', + 'ext': 'mp4', + 'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4', + 'uploader': 'MLS Video', + 'timestamp': 1566320259, + 'upload_date': '20190820', + 'uploader_id': '235196876', + } + } + + def _real_extract(self, url): + shared_name, file_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, file_id) + request_token = self._parse_json(self._search_regex( + r'Box\.config\s*=\s*({.+?});', webpage, + 'Box config'), file_id)['requestToken'] + access_token = self._download_json( + 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, + 'Downloading token JSON metadata', + data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ + 'Content-Type': 'application/json', + 'X-Request-Token': request_token, + 'X-Box-EndUser-API': 'sharedName=' + shared_name, + })[file_id]['read'] + shared_link = 'https://app.box.com/s/' + shared_name + f = self._download_json( + 'https://api.box.com/2.0/files/' + file_id, file_id, + 'Downloading file JSON metadata', headers={ + 'Authorization': 'Bearer ' + access_token, + 'BoxApi': 'shared_link=' + shared_link, + 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats + }, query={ + 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' + }) + title = f['name'] + + query = { + 'access_token': access_token, + 'shared_link': shared_link + } + + formats = [] + + # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): + # entry_url_template = try_get( + # entry, lambda x: 
x['content']['url_template']) + # if not entry_url_template: + # continue + # representation = entry.get('representation') + # if representation == 'dash': + # TODO: append query to every fragment URL + # formats.extend(self._extract_mpd_formats( + # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), + # file_id, query=query)) + + authenticated_download_url = f.get('authenticated_download_url') + if authenticated_download_url and f.get('is_download_available'): + formats.append({ + 'ext': f.get('extension') or determine_ext(title), + 'filesize': f.get('size'), + 'format_id': 'download', + 'url': update_url_query(authenticated_download_url, query), + }) + + self._sort_formats(formats) + + creator = f.get('created_by') or {} + + return { + 'id': file_id, + 'title': title, + 'formats': formats, + 'description': f.get('description') or None, + 'uploader': creator.get('name'), + 'timestamp': parse_iso8601(f.get('created_at')), + 'uploader_id': creator.get('id'), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index fb18a0563..7ba4087fd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -122,6 +122,7 @@ from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bostonglobe import BostonGlobeIE +from .box import BoxIE from .bpb import BpbIE from .br import ( BRIE, From dd0f524c69ad95541f7d370bdb877ee68f722f26 Mon Sep 17 00:00:00 2001 From: renalid <renalid@gmail.com> Date: Sun, 22 Nov 2020 19:35:53 +0100 Subject: [PATCH 86/87] [franceinter] add thumbnail url (#27153) Co-authored-by: remitamine <remitamine@gmail.com> --- youtube_dl/extractor/franceinter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 05806895c..a009f4d38 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -16,6 +16,7 @@ class 
FranceInterIE(InfoExtractor): 'ext': 'mp3', 'title': 'Affaire Cahuzac : le contentieux du compte en Suisse', 'description': 'md5:401969c5d318c061f86bda1fa359292b', + 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20160907', }, } @@ -31,6 +32,7 @@ class FranceInterIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) upload_date_str = self._search_regex( r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', @@ -48,6 +50,7 @@ class FranceInterIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, + 'thumbnail' : thumbnail, 'upload_date': upload_date, 'formats': [{ 'url': video_url, From 2cd43a00d1eec54c40e3fdf9a288f4cb391a8323 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 22 Nov 2020 19:38:45 +0100 Subject: [PATCH 87/87] [franceinter] flake8 --- youtube_dl/extractor/franceinter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index a009f4d38..ae822a50e 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -50,7 +50,7 @@ class FranceInterIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, - 'thumbnail' : thumbnail, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'formats': [{ 'url': video_url,