diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index bf4251004..32c14aa85 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -21,15 +21,15 @@ ## Checklist - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dlc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlc version **2020.10.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -44,7 +44,7 @@ ## Verbose log [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dlc version 2020.10.26 + [debug] youtube-dlc version 2020.10.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 889005097..fe1aade05 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -21,15 +21,15 @@ ## Checklist - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dlcc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlcc version **2020.10.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index e5d714388..cddb81dda 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -21,13 +21,13 @@ ## Checklist - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dlc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlc version **2020.10.31** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 9de52f98c..920ae8dbc 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -21,16 +21,16 @@ ## Checklist - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dlc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlc version **2020.10.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -46,7 +46,7 @@ ## Verbose log [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dlc version 2020.10.26 + [debug] youtube-dlc version 2020.10.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 86fac96dd..7cc390f58 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -21,13 +21,13 @@ ## Checklist - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dlc version **2020.10.26** +- [ ] I've verified that I'm running youtube-dlc version **2020.10.31** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/6_question.md b/.github/ISSUE_TEMPLATE/6_question.md index 034a9c5ac..3c3ae0f3b 100644 --- a/.github/ISSUE_TEMPLATE/6_question.md +++ b/.github/ISSUE_TEMPLATE/6_question.md @@ -21,8 +21,8 @@ ## Checklist @@ -34,7 +34,7 @@ ## Checklist ## Question WRITE QUESTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md index 8f9bb2c33..3fe4d6968 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md @@ -18,10 +18,10 @@ ## Checklist diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md index 9748afd4d..aad8fa054 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md @@ -19,10 +19,10 @@ ## Checklist diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md index f274e8aeb..2fb82f828 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md @@ -18,8 +18,8 @@ ## Checklist diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md index 788f1c9a1..b7bebf8ab 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md @@ -18,11 +18,11 @@ ## Checklist diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md index 9b3b8c3bf..99592f79d 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md @@ -19,8 +19,8 @@ ## Checklist diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8db7e92f2..dd6a95256 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.x' + python-version: '3.8' - name: Install packages run: sudo apt-get -y install zip pandoc man - name: Bump version @@ -57,7 +57,7 @@ jobs: id: sha2_file env: SHA2: ${{ hashFiles('youtube-dlc') }} - run: echo "::set-output name=sha2_unix::${env:SHA2}" + run: echo "::set-output name=sha2_unix::$SHA2" - name: Install dependencies for pypi run: | python -m pip install --upgrade pip @@ -82,7 +82,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.x' + python-version: '3.8' - name: Install Requirements run: pip install pyinstaller - name: Bump version @@ -98,25 +98,25 @@ jobs: upload_url: ${{ needs.build_unix.outputs.upload_url }} asset_path: ./dist/youtube-dlc.exe asset_name: youtube-dlc.exe - asset_content_type: application/octet-stream + asset_content_type: application/vnd.microsoft.portable-executable - name: Get SHA2-256SUMS for youtube-dlc.exe id: sha2_file_win env: - SHA2: ${{ hashFiles('dist/youtube-dlc.exe') }} - run: echo "::set-output name=sha2_windows::${env:SHA2}" + SHA2_win: ${{ hashFiles('dist/youtube-dlc.exe') }} + run: echo "::set-output name=sha2_windows::$SHA2_win" build_windows32: runs-on: windows-latest - needs: build_unix + needs: [build_unix, build_windows] steps: - uses: actions/checkout@v2 - - name: Set up Python 3.5.4 32-Bit + - name: Set up Python 3.4.4 32-Bit uses: actions/setup-python@v2 with: - python-version: '3.5.4' + python-version: '3.4.4' architecture: 'x86' - name: Install Requirements for 32 Bit run: pip install pyinstaller==3.5 @@ -133,12 +133,12 @@ jobs: upload_url: ${{ needs.build_unix.outputs.upload_url }} asset_path: ./dist/youtube-dlc_x86.exe asset_name: youtube-dlc_x86.exe - asset_content_type: application/octet-stream + asset_content_type: application/vnd.microsoft.portable-executable - name: Get SHA2-256SUMS for youtube-dlc_x86.exe id: sha2_file_win32 env: - SHA2: ${{ hashFiles('dist/youtube-dlc_x86.exe') }} - run: echo "::set-output name=sha2_windows32::${env:SHA2}" + SHA2_win32: ${{ hashFiles('dist/youtube-dlc_x86.exe') }} + run: echo "::set-output name=sha2_windows32::$SHA2_win32" - name: Make SHA2-256SUMS file env: SHA2_WINDOWS: ${{ needs.build_windows.outputs.sha2_windows }} @@ -146,6 +146,18 @@ jobs: SHA2_UNIX: ${{ needs.build_unix.outputs.sha2_unix }} YTDLC_VERSION: ${{ needs.build_unix.outputs.ytdlc_version }} run: | - echo "$SHA2_WINDOWS youtube-dlc.exe" > SHA2-256SUMS - echo "$SHA2_WINDOWS32 youtube-dlc32.exe" > SHA2-256SUMS - echo "$SHA2_UNIX youtube-dlc" >> SHA2-256SUMS + echo "version:${env:YTDLC_VERSION}" >> SHA2-256SUMS + echo "youtube-dlc.exe:${env:SHA2_WINDOWS}" >> SHA2-256SUMS + echo "youtube-dlc_x86.exe:${env:SHA2_WINDOWS32}" >> SHA2-256SUMS + echo "youtube-dlc:${env:SHA2_UNIX}" >> SHA2-256SUMS + + - name: Upload 256SUMS file + id: upload-sums + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.build_unix.outputs.upload_url }} + asset_path: ./SHA2-256SUMS + asset_name: SHA2-256SUMS + asset_content_type: text/plain diff --git a/README.md b/README.md index 9d40d2631..170c85c48 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,15 @@ -[![Build Status](https://travis-ci.com/blackjack4494/youtube-dlc.svg?branch=master)](https://travis-ci.com/blackjack4494/youtube-dlc) +[![Build Status](https://travis-ci.com/blackjack4494/yt-dlc.svg?branch=master)](https://travis-ci.com/blackjack4494/yt-dlc) [![PyPi](https://img.shields.io/pypi/v/youtube-dlc.svg)](https://pypi.org/project/youtube-dlc) -[![Downloads](https://pepy.tech/badge/youtube-dlc)](https://pepy.tech/project/youtube-dlc) [![Gitter chat](https://img.shields.io/gitter/room/youtube-dlc/community)](https://gitter.im/youtube-dlc) -[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/youtube-dlc/blob/master/LICENSE) +[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/yt-dlc/blob/master/LICENSE) youtube-dlc - download videos from youtube.com or other video platforms. -youtube-dlc is a fork of youtube-dl with the intention of getting features tested by the community merged in the tool faster, since youtube-dl's development seems to be slowing down. (https://github.com/ytdl-org/youtube-dl/issues/26462) +youtube-dlc is a fork of youtube-dl with the intention of getting features tested by the community merged in the tool faster, since youtube-dl's development seems to be slowing down. (https://web.archive.org/web/20201014194602/https://github.com/ytdl-org/youtube-dl/issues/26462) - [INSTALLATION](#installation) +- [UPDATE](#update) - [DESCRIPTION](#description) - [OPTIONS](#options) - [Network Options:](#network-options) @@ -44,6 +44,10 @@ # INSTALLATION python -m pip install --upgrade youtube-dlc +If you want to install the current master branch + + python -m pip install git+https://github.com/blackjack4494/yt-dlc + **UNIX** (Linux, macOS, etc.) Using wget: @@ -213,6 +217,8 @@ ## Video Selection: --download-archive FILE Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + --break-on-existing Stop the download process after attempting + to download a file that's in the archive. --include-ads Download advertisements as well (experimental) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index e6de72b33..c27ef9781 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -61,7 +61,7 @@ def build_lazy_ie(ie, name): return s -# find the correct sorting and add the required base classes so that sublcasses +# find the correct sorting and add the required base classes so that subclasses # can be correctly created classes = _ALL_CLASSES[:-1] ordered_cls = [] diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c46d122ff..0b183b272 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -59,9 +59,9 @@ # Supported sites - **ARD:mediathek** - **ARDBetaMediathek** - **Arkena** - - **arte.tv:+7** - - **arte.tv:embed** - - **arte.tv:playlist** + - **ArteTV** + - **ArteTVEmbed** + - **ArteTVPlaylist** - **AsianCrush** - **AsianCrushPlaylist** - **AtresPlayer** @@ -104,12 +104,14 @@ # Supported sites - **BIQLE** - **BitChute** - **BitChuteChannel** + - **bitwave.tv** - **BleacherReport** - **BleacherReportCMS** - **blinkx** - **Bloomberg** - **BokeCC** - **BostonGlobe** + - **Box** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk - **BravoTV** @@ -157,6 +159,7 @@ # Supported sites - **Chilloutzone** - **chirbit** - **chirbit:profile** + - **cielotv.it** - **Cinchcast** - **Cinemax** - **CiscoLiveSearch** @@ -424,6 +427,7 @@ # Supported sites - **la7.it** - **laola1tv** - **laola1tv:embed** + - **lbry.tv** - **LCI** - **Lcp** - **LcpPlay** @@ -474,6 +478,7 @@ # Supported sites - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA + - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** - **Medialaan** @@ -582,6 +587,7 @@ # Supported sites - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - **Nintendo** + - **Nitter** - **njoy**: N-JOY - **njoy:embed** - **NJPWWorld**: 新日本プロレスワールド @@ -616,6 +622,7 @@ # Supported sites - **Nuvid** - **NYTimes** - **NYTimesArticle** + - **NYTimesCooking** - **NZZ** - **ocw.mit.edu** - **OdaTV** @@ -668,6 +675,8 @@ # Supported sites - **PicartoVod** - **Piksel** - **Pinkbike** + - **Pinterest** + - **PinterestCollection** - **Pladform** - **Platzi** - **PlatziCourse** @@ -764,6 +773,7 @@ # Supported sites - **RTVNH** - **RTVS** - **RUHD** + - **RumbleEmbed** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels - **rutube:embed**: Rutube embedded videos @@ -834,12 +844,14 @@ # Supported sites - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - - **Spiegel:Article**: Articles on spiegel.de - - **Spiegeltv** - **sport.francetvinfo.fr** - **Sport5** - **SportBox** - **SportDeutschland** + - **Spreaker** + - **SpreakerPage** + - **SpreakerShow** + - **SpreakerShowPage** - **SpringboardPlatform** - **Sprout** - **sr:mediathek**: Saarländischer Rundfunk @@ -943,6 +955,7 @@ # Supported sites - **TV2DKBornholmPlay** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ + - **tv8.it** - **TVA** - **TVANouvelles** - **TVANouvellesArticle** @@ -1057,7 +1070,7 @@ # Supported sites - **vk:wallpost** - **vlive** - **vlive:channel** - - **vlive:playlist** + - **vlive:post** - **Vodlocker** - **VODPl** - **VODPlatform** @@ -1146,20 +1159,17 @@ # Supported sites - **YourPorn** - **YourUpload** - **youtube**: YouTube.com - - **youtube:channel**: YouTube.com channels - - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) + - **youtube:favorites**: YouTube.com liked videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists - - **youtube:playlists**: YouTube.com user/channel playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - - **youtube:search**: YouTube.com searches - - **youtube:search:date**: YouTube.com searches, newest videos first + - **youtube:search**: YouTube.com searches, "ytsearch" keyword + - **youtube:search:date**: YouTube.com searches, newest videos first, "ytsearchdate" keyword - **youtube:search_url**: YouTube.com search URLs - - **youtube:show**: YouTube.com (multi-season) shows - - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) + - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication) + - **youtube:tab**: YouTube.com tab - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword - **Zapiks** - **Zaq1** - **Zattoo** diff --git a/make_win.bat b/make_win.bat index 891d517b3..c35d9937e 100644 --- a/make_win.bat +++ b/make_win.bat @@ -1 +1 @@ -py -m PyInstaller youtube_dlc\__main__.py --onefile --name youtube-dlc --version-file win\ver.txt --icon win\icon\cloud.ico \ No newline at end of file +py -m PyInstaller youtube_dlc\__main__.py --onefile --name youtube-dlc --version-file win\ver.txt --icon win\icon\cloud.ico --upx-exclude=vcruntime140.dll \ No newline at end of file diff --git a/setup.py b/setup.py index a10ef0a77..6908f2404 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ def run(self): description=DESCRIPTION, long_description=LONG_DESCRIPTION, # long_description_content_type="text/markdown", - url="https://github.com/blackjack4494/youtube-dlc", + url="https://github.com/blackjack4494/yt-dlc", packages=find_packages(exclude=("youtube_dl","test",)), #packages=[ # 'youtube_dlc', diff --git a/test/parameters.json b/test/parameters.json index 7bf59c25f..65fd54428 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -37,7 +37,7 @@ "writeinfojson": true, "writesubtitles": false, "allsubtitles": false, - "listssubtitles": false, + "listsubtitles": false, "socket_timeout": 20, "fixup": "never" } diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6d02c2a54..a9e649191 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -919,6 +919,76 @@ def _real_extract(self, url): self.assertEqual(downloaded['extractor'], 'testex') self.assertEqual(downloaded['extractor_key'], 'TestEx') + # Test case for https://github.com/ytdl-org/youtube-dl/issues/27064 + def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): + + class _YDL(YDL): + def __init__(self, *args, **kwargs): + super(_YDL, self).__init__(*args, **kwargs) + + def trouble(self, s, tb=None): + pass + + ydl = _YDL({ + 'format': 'extra', + 'ignoreerrors': True, + }) + + class VideoIE(InfoExtractor): + _VALID_URL = r'video:(?P\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [{ + 'format_id': 'default', + 'url': 'url:', + }] + if video_id == '0': + raise ExtractorError('foo') + if video_id == '2': + formats.append({ + 'format_id': 'extra', + 'url': TEST_URL, + }) + return { + 'id': video_id, + 'title': 'Video %s' % video_id, + 'formats': formats, + } + + class PlaylistIE(InfoExtractor): + _VALID_URL = r'playlist:' + + def _entries(self): + for n in range(3): + video_id = compat_str(n) + yield { + '_type': 'url_transparent', + 'ie_key': VideoIE.ie_key(), + 'id': video_id, + 'url': 'video:%s' % video_id, + 'title': 'Video Transparent %s' % video_id, + } + + def _real_extract(self, url): + return self.playlist_result(self._entries()) + + ydl.add_info_extractor(VideoIE(ydl)) + ydl.add_info_extractor(PlaylistIE(ydl)) + info = ydl.extract_info('playlist:') + entries = info['entries'] + self.assertEqual(len(entries), 3) + self.assertTrue(entries[0] is None) + self.assertTrue(entries[1] is None) + self.assertEqual(len(ydl.downloaded_info_dicts), 1) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(entries[2], downloaded) + self.assertEqual(downloaded['url'], TEST_URL) + self.assertEqual(downloaded['title'], 'Video Transparent 2') + self.assertEqual(downloaded['id'], '2') + self.assertEqual(downloaded['extractor'], 'Video') + self.assertEqual(downloaded['extractor_key'], 'Video') + if __name__ == '__main__': unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 548bc6750..8dcdc4e58 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -31,45 +31,47 @@ def assertMatch(self, url, ie_list): def test_youtube_playlist_matching(self): assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist']) + assertTab = lambda url: self.assertMatch(url, ['youtube:tab']) assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 - assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') + assertPlaylist('PL63F0C78739B09958') + assertTab('https://www.youtube.com/AsapSCIENCE') + assertTab('https://www.youtube.com/embedded') + assertTab('https://www.youtube.com/feed') # Own channel's home page + assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') - assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 + assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') + assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) # Top tracks - assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101') + assertTab('https://www.youtube.com/playlist?list=MCUS.20142101') def test_youtube_matching(self): self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) # 668 self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) - self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) + # self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) # /v/ is no longer valid self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube']) self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube']) def test_youtube_channel_matching(self): - assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) + assertChannel = lambda url: self.assertMatch(url, ['youtube:tab']) assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') - def test_youtube_user_matching(self): - self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user']) + # def test_youtube_user_matching(self): + # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) def test_youtube_feeds(self): - self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) - self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) - self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) - self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) + self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) - def test_youtube_show_matching(self): - self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) - - def test_youtube_search_matching(self): - self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) - self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) + # def test_youtube_search_matching(self): + # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) + # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) diff --git a/test/test_utils.py b/test/test_utils.py index 95231200b..16ad40831 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -937,6 +937,28 @@ def test_js_to_json_edgecases(self): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + # Just drop ! prefix for now though this results in a wrong value + on = js_to_json('''{ + a: !0, + b: !1, + c: !!0, + d: !!42.42, + e: !!![], + f: !"abc", + g: !"", + !42: 42 + }''') + self.assertEqual(json.loads(on), { + 'a': 0, + 'b': 1, + 'c': 0, + 'd': 42.42, + 'e': [], + 'f': "abc", + 'g': "", + '42': 42 + }) + on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) @@ -994,6 +1016,12 @@ def test_js_to_json_edgecases(self): on = js_to_json('{42:4.2e1}') self.assertEqual(json.loads(on), {'42': 42.0}) + on = js_to_json('{ "0x40": "0x40" }') + self.assertEqual(json.loads(on), {'0x40': '0x40'}) + + on = js_to_json('{ "040": "040" }') + self.assertEqual(json.loads(on), {'040': '040'}) + def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index fc351db0d..ef6fe0a78 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -210,6 +210,8 @@ class YoutubeDL(object): download_archive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. + break_on_existing: Stop the download process after attempting to download a file that's + in the archive. cookiefile: File name where cookies should be read from and dumped to. nocheckcertificate:Do not verify SSL certificates prefer_insecure: Use HTTP instead of HTTPS to retrieve information. @@ -801,7 +803,7 @@ def add_extra_info(info_dict, extra_info): for key, value in extra_info.items(): info_dict.setdefault(key, value) - def extract_info(self, url, download=True, ie_key=None, extra_info={}, + def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_info={}, process=True, force_generic_extractor=False): ''' Returns a list with a dictionary for each video we find. @@ -821,26 +823,30 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={}, if not ie.suitable(url): continue - ie = self.get_info_extractor(ie.ie_key()) + ie_key = ie.ie_key() + ie = self.get_info_extractor(ie_key) if not ie.working(): self.report_warning('The program functionality for this site has been marked as broken, ' 'and will probably not work.') try: - ie_result = ie.extract(url) - if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) - break - if isinstance(ie_result, list): - # Backwards compatibility: old IE result format - ie_result = { - '_type': 'compat_list', - 'entries': ie_result, - } - self.add_default_extra_info(ie_result, ie, url) - if process: - return self.process_ie_result(ie_result, download, extra_info) - else: - return ie_result + temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) + except (AssertionError, IndexError, AttributeError): + temp_id = None + if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): + self.to_screen("[%s] %s: has already been recorded in archive" % ( + ie_key, temp_id)) + break + + return self.__extract_info(url, ie, download, extra_info, process, info_dict) + + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def __handle_extraction_exceptions(func): + def wrapper(self, *args, **kwargs): + try: + return func(self, *args, **kwargs) except GeoRestrictedError as e: msg = e.msg if e.countries: @@ -848,20 +854,38 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={}, map(ISO3166Utils.short2full, e.countries)) msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' self.report_error(msg) - break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) - break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - break else: raise + return wrapper + + @__handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process, info_dict): + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + if info_dict: + if info_dict.get('id'): + ie_result['id'] = info_dict['id'] + if info_dict.get('title'): + ie_result['title'] = info_dict['title'] + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + return ie_result def add_default_extra_info(self, ie_result, ie, url): self.add_extra_info(ie_result, { @@ -898,7 +922,7 @@ def process_ie_result(self, ie_result, download=True, extra_info={}): # We have to add extra_info to the results because it may be # contained in a playlist return self.extract_info(ie_result['url'], - download, + download, info_dict=ie_result, ie_key=ie_result.get('ie_key'), extra_info=extra_info) elif result_type == 'url_transparent': @@ -1033,12 +1057,15 @@ def report_download(num_entries): reason = self._match_entry(entry, incomplete=True) if reason is not None: - self.to_screen('[download] ' + reason) - continue + if reason.endswith('has already been recorded in the archive') and self.params.get('break_on_existing'): + print('[download] tried downloading a file that\'s already in the archive, stopping since --break-on-existing is set.') + break + else: + self.to_screen('[download] ' + reason) + continue - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? playlist_results.append(entry_result) ie_result['entries'] = playlist_results self.to_screen('[download] Finished downloading playlist: %s' % playlist) @@ -1067,6 +1094,11 @@ def _fixup(r): else: raise Exception('Invalid result type: %s' % result_type) + @__handle_extraction_exceptions + def __process_iterable_entry(self, entry, download, extra_info): + return self.process_ie_result( + entry, download=download, extra_info=extra_info) + def _build_format_filter(self, filter_spec): " Returns a function to filter the formats according to the filter_spec " @@ -1852,13 +1884,13 @@ def ensure_dir_exists(path): self.report_error('Cannot write annotations file: ' + annofn) return - def dl(name, info): + def dl(name, info, subtitle=False): fd = get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) - return fd.download(name, info) + return fd.download(name, info, subtitle) subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) @@ -1867,7 +1899,7 @@ def dl(name, info): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['requested_subtitles'] - ie = self.get_info_extractor(info_dict['extractor_key']) + # ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) @@ -1886,6 +1918,8 @@ def dl(name, info): return else: try: + dl(sub_filename, sub_info, subtitle=True) + ''' if self.params.get('sleep_interval_subtitles', False): dl(sub_filename, sub_info) else: @@ -1893,6 +1927,7 @@ def dl(name, info): sub_info['url'], info_dict['id'], note=False).read() with io.open(encodeFilename(sub_filename), 'wb') as subfile: subfile.write(sub_data) + ''' except (ExtractorError, IOError, OSError, ValueError, compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) diff --git a/youtube_dlc/__init__.py b/youtube_dlc/__init__.py index 105786bc0..7d72ab985 100644 --- a/youtube_dlc/__init__.py +++ b/youtube_dlc/__init__.py @@ -405,6 +405,7 @@ def parse_retries(retries): 'youtube_print_sig_code': opts.youtube_print_sig_code, 'age_limit': opts.age_limit, 'download_archive': download_archive_fn, + 'break_on_existing': opts.break_on_existing, 'cookiefile': opts.cookiefile, 'nocheckcertificate': opts.no_check_certificate, 'prefer_insecure': opts.prefer_insecure, diff --git a/youtube_dlc/compat.py b/youtube_dlc/compat.py index 1cf7efed6..ac889ddd7 100644 --- a/youtube_dlc/compat.py +++ b/youtube_dlc/compat.py @@ -2345,7 +2345,7 @@ def __init__(self, version, name, value, *args, **kwargs): # HTMLParseError has been deprecated in Python 3.3 and removed in # Python 3.5. Introducing dummy exception for Python >3.5 for compatible - # and uniform cross-version exceptiong handling + # and uniform cross-version exception handling class compat_HTMLParseError(Exception): pass diff --git a/youtube_dlc/downloader/common.py b/youtube_dlc/downloader/common.py index 31c286458..7d303be1c 100644 --- a/youtube_dlc/downloader/common.py +++ b/youtube_dlc/downloader/common.py @@ -326,7 +326,7 @@ def report_unable_to_resume(self): """Report it was impossible to resume download.""" self.to_screen('[download] Unable to resume') - def download(self, filename, info_dict): + def download(self, filename, info_dict, subtitle=False): """Download to a filename using the info from info_dict Return True on success and False otherwise """ @@ -353,16 +353,25 @@ def download(self, filename, info_dict): }) return True - min_sleep_interval = self.params.get('sleep_interval') - if min_sleep_interval: - max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) - sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) - self.to_screen( - '[download] Sleeping %s seconds...' % ( - int(sleep_interval) if sleep_interval.is_integer() - else '%.2f' % sleep_interval)) - time.sleep(sleep_interval) - + if subtitle is False: + min_sleep_interval = self.params.get('sleep_interval') + if min_sleep_interval: + max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) + sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) + self.to_screen( + '[download] Sleeping %s seconds...' % ( + int(sleep_interval) if sleep_interval.is_integer() + else '%.2f' % sleep_interval)) + time.sleep(sleep_interval) + else: + sleep_interval_sub = 0 + if type(self.params.get('sleep_interval_subtitles')) is int: + sleep_interval_sub = self.params.get('sleep_interval_subtitles') + if sleep_interval_sub > 0: + self.to_screen( + '[download] Sleeping %s seconds...' % ( + sleep_interval_sub)) + time.sleep(sleep_interval_sub) return self.real_download(filename, info_dict) def real_download(self, filename, info_dict): diff --git a/youtube_dlc/downloader/external.py b/youtube_dlc/downloader/external.py index c31f8910a..d2f8f271d 100644 --- a/youtube_dlc/downloader/external.py +++ b/youtube_dlc/downloader/external.py @@ -115,8 +115,10 @@ class CurlFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') cmd += self._valueless_option('--silent', 'noprogress') cmd += self._valueless_option('--verbose', 'verbose') @@ -150,8 +152,9 @@ class AxelFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['-H', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', '%s: %s' % (key, val)] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -162,8 +165,9 @@ class WgetFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--limit-rate', 'ratelimit') retry = self._option('--tries', 'retries') if len(retry) == 2: @@ -189,8 +193,9 @@ def _make_cmd(self, tmpfilename, info_dict): if dn: cmd += ['--dir', dn] cmd += ['--out', os.path.basename(tmpfilename)] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 'proxy') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') @@ -206,8 +211,10 @@ def available(cls): def _make_cmd(self, tmpfilename, info_dict): cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] - for key, val in info_dict['http_headers'].items(): - cmd += ['%s:%s' % (key, val)] + + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['%s:%s' % (key, val)] return cmd @@ -253,7 +260,7 @@ def _call_downloader(self, tmpfilename, info_dict): # if end_time: # args += ['-t', compat_str(end_time - start_time)] - if info_dict['http_headers'] and re.match(r'^https?://', url): + if info_dict.get('http_headers') is not None and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. headers = handle_youtubedl_headers(info_dict['http_headers']) diff --git a/youtube_dlc/downloader/fragment.py b/youtube_dlc/downloader/fragment.py index 9339b3a62..cf4fd41da 100644 --- a/youtube_dlc/downloader/fragment.py +++ b/youtube_dlc/downloader/fragment.py @@ -97,12 +97,15 @@ def _write_ytdl_file(self, ctx): def _download_fragment(self, ctx, frag_url, info_dict, headers=None): fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - success = ctx['dl'].download(fragment_filename, { + fragment_info_dict = { 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), - }) + } + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') down, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() @@ -258,6 +261,13 @@ def _finish_frag_download(self, ctx): downloaded_bytes = ctx['complete_frags_downloaded_bytes'] else: self.try_rename(ctx['tmpfilename'], ctx['filename']) + if self.params.get('updatetime', True): + filetime = ctx.get('fragment_filetime') + if filetime: + try: + os.utime(ctx['filename'], (time.time(), filetime)) + except Exception: + pass downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ diff --git a/youtube_dlc/downloader/http.py b/youtube_dlc/downloader/http.py index 96379caf1..d8ac41dcc 100644 --- a/youtube_dlc/downloader/http.py +++ b/youtube_dlc/downloader/http.py @@ -109,7 +109,9 @@ def establish_connection(): try: ctx.data = self.ydl.urlopen(request) except (compat_urllib_error.URLError, ) as err: - if isinstance(err.reason, socket.timeout): + # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 + reason = getattr(err, 'reason', None) + if isinstance(reason, socket.timeout): raise RetryDownload(err) raise err # When trying to resume, Content-Range HTTP header of response has to be checked diff --git a/youtube_dlc/downloader/youtube_live_chat.py b/youtube_dlc/downloader/youtube_live_chat.py index 4932dd9c5..b333afa5b 100644 --- a/youtube_dlc/downloader/youtube_live_chat.py +++ b/youtube_dlc/downloader/youtube_live_chat.py @@ -82,7 +82,10 @@ def parse_yt_initial_data(data): offset = int(replay_chat_item_action['videoOffsetTimeMsec']) processed_fragment.extend( json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') - continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] + try: + continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] + except KeyError: + continuation_id = None self._append_fragment(ctx, processed_fragment) diff --git a/youtube_dlc/extractor/adobepass.py b/youtube_dlc/extractor/adobepass.py index 38dca1b0a..649f9940f 100644 --- a/youtube_dlc/extractor/adobepass.py +++ b/youtube_dlc/extractor/adobepass.py @@ -1438,6 +1438,13 @@ def extract_redirect_url(html, url=None, fatal=False): provider_redirect_page, 'oauth redirect') self._download_webpage( oauth_redirect_url, video_id, 'Confirming auto login') + elif 'automatically signed in with' in provider_redirect_page: + # Seems like comcast is rolling up new way of automatically signing customers + oauth_redirect_url = self._html_search_regex( + r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page, + 'oauth redirect (signed)') + # Just need to process the request. No useful data comes back + self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login') else: if '
\w+)' + _TESTS = [{ + # Youtube + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour', + 'timestamp': 1549639570, + } + }, { + # Vimeo + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294763658, + 'upload_date': '20110111', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, { + # Direct Link + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 's8KL7I3jLmh6', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20091007', + 'timestamp': 1254942511, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json( + 'https://amara.org/api/videos/%s/' % video_id, + video_id, query={'format': 'json'}) + title = meta['title'] + video_url = meta['all_urls'][0] + + subtitles = {} + for language in (meta.get('languages') or []): + subtitles_uri = language.get('subtitles_uri') + if not (subtitles_uri and language.get('published')): + continue + subtitle = subtitles.setdefault(language.get('code') or 'en', []) + for f in ('json', 'srt', 'vtt'): + subtitle.append({ + 'ext': f, + 'url': update_url_query(subtitles_uri, {'format': f}), + }) + + info = { + 'url': video_url, + 'id': video_id, + 'subtitles': subtitles, + 'title': title, + 'description': meta.get('description'), + 'thumbnail': meta.get('thumbnail'), + 'duration': int_or_none(meta.get('duration')), + 'timestamp': parse_iso8601(meta.get('created')), + } + + for ie in (YoutubeIE, VimeoIE): + if ie.suitable(video_url): + info.update({ + '_type': 'url_transparent', + 'ie_key': ie.ie_key(), + }) + break + + return info diff --git a/youtube_dlc/extractor/arte.py b/youtube_dlc/extractor/arte.py index 2bd3bfe8a..03abdbfaf 100644 --- a/youtube_dlc/extractor/arte.py +++ b/youtube_dlc/extractor/arte.py @@ -4,23 +4,57 @@ import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, qualities, try_get, unified_strdate, + url_or_none, ) -# There are different sources of video in arte.tv, the extraction process -# is different for each one. The videos usually expire in 7 days, so we can't -# add tests. - class ArteTVBaseIE(InfoExtractor): - def _extract_from_json_url(self, json_url, video_id, lang, title=None): - info = self._download_json(json_url, video_id) + _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' + _API_BASE = 'https://api.arte.tv/api/player/v1' + + +class ArteTVIE(ArteTVBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?arte\.tv/(?P%(langs)s)/videos| + api\.arte\.tv/api/player/v\d+/config/(?P%(langs)s) + ) + /(?P\d{6}-\d{3}-[AF]) + ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'info_dict': { + 'id': '088501-000-A', + 'ext': 'mp4', + 'title': 'Mexico: Stealing Petrol to Survive', + 'upload_date': '20190628', + }, + }, { + 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', + 'only_matching': True, + }, { + 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + lang = mobj.group('lang') or mobj.group('lang_2') + + info = self._download_json( + '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) player_info = info['videoJsonPlayer'] vsr = try_get(player_info, lambda x: x['VSR'], dict) @@ -37,18 +71,11 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None): if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - title = (player_info.get('VTI') or title or player_info['VID']).strip() + title = (player_info.get('VTI') or player_info['VID']).strip() subtitle = player_info.get('VSU', '').strip() if subtitle: title += ' - %s' % subtitle - info_dict = { - 'id': player_info['VID'], - 'title': title, - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), - } qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) LANGS = { @@ -65,6 +92,10 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None): formats = [] for format_id, format_dict in vsr.items(): f = dict(format_dict) + format_url = url_or_none(f.get('url')) + streamer = f.get('streamer') + if not format_url and not streamer: + continue versionCode = f.get('versionCode') l = re.escape(langcode) @@ -107,6 +138,16 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None): else: lang_pref = -1 + media_type = f.get('mediaType') + if media_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for m3u8_format in m3u8_formats: + m3u8_format['language_preference'] = lang_pref + formats.extend(m3u8_formats) + continue + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -118,7 +159,7 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None): 'quality': qfunc(f.get('quality')), } - if f.get('mediaType') == 'rtmp': + if media_type == 'rtmp': format['url'] = f['streamer'] format['play_path'] = 'mp4:' + f['url'] format['ext'] = 'flv' @@ -127,56 +168,50 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None): formats.append(format) - self._check_formats(formats, video_id) self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict + return { + 'id': player_info.get('VID') or video_id, + 'title': title, + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(upload_date_str), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'formats': formats, + } -class ArteTVPlus7IE(ArteTVBaseIE): - IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?Pfr|de|en|es|it|pl)/videos/(?P\d{6}-\d{3}-[AF])' - +class ArteTVEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' _TESTS = [{ - 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { - 'id': '088501-000-A', + 'id': '100605-013-A', 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', + 'title': 'United we Stream November Lockdown Edition #13', + 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', + 'upload_date': '20201116', }, + }, { + 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, }] - def _real_extract(self, url): - lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url( - 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), - video_id, lang) - - -class ArteTVEmbedIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:embed' - _VALID_URL = r'''(?x) - https://www\.arte\.tv - /player/v3/index\.php\?json_url= - (?P - https?://api\.arte\.tv/api/player/v1/config/ - (?P[^/]+)/(?P\d{6}-\d{3}-[AF]) - ) - ''' - - _TESTS = [] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<(?:iframe|script)[^>]+src=(["\'])(?P(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', + webpage)] def _real_extract(self, url): - json_url, lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url(json_url, video_id, lang) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + json_url = qs['json_url'][0] + video_id = ArteTVIE._match_id(json_url) + return self.url_result( + json_url, ie=ArteTVIE.ie_key(), video_id=video_id) class ArteTVPlaylistIE(ArteTVBaseIE): - IE_NAME = 'arte.tv:playlist' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?Pfr|de|en|es|it|pl)/videos/(?PRC-\d{6})' - + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P%s)/videos/(?PRC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', 'info_dict': { @@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', }, 'playlist_mincount': 6, + }, { + 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', + 'only_matching': True, }] def _real_extract(self, url): lang, playlist_id = re.match(self._VALID_URL, url).groups() collection = self._download_json( - 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' - % (lang, playlist_id), playlist_id) + '%s/collectionData/%s/%s?source=videos' + % (self._API_BASE, lang, playlist_id), playlist_id) + entries = [] + for video in collection['videos']: + if not isinstance(video, dict): + continue + video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) + if not video_url: + continue + video_id = video.get('programId') + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), + 'duration': int_or_none(video.get('durationSeconds')), + 'view_count': int_or_none(video.get('views')), + 'ie_key': ArteTVIE.ie_key(), + }) title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') - entries = [ - self._extract_from_json_url( - video['jsonUrl'], video.get('programId') or playlist_id, lang) - for video in collection['videos'] if video.get('jsonUrl')] return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dlc/extractor/bandcamp.py b/youtube_dlc/extractor/bandcamp.py index 9dbafe86d..69e673a26 100644 --- a/youtube_dlc/extractor/bandcamp.py +++ b/youtube_dlc/extractor/bandcamp.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import random @@ -5,10 +6,7 @@ import time from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -17,33 +15,32 @@ parse_filesize, str_or_none, try_get, - unescapeHTML, update_url_query, unified_strdate, unified_timestamp, url_or_none, + urljoin, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P[^/?#&]+)' + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song', + 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭", 'duration': 9.8485, - 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", - 'timestamp': 1354224127, + 'uploader': 'youtube-dl "\'/\\ä↭', 'upload_date': '20121129', + 'timestamp': 1354224127, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '5d92af55811e47f38962a54c30b07ef0', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -82,11 +79,16 @@ class BandcampIE(InfoExtractor): }, }] + def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): + return self._parse_json(self._html_search_regex( + r'data-%s=(["\'])({.+?})\1' % attr, webpage, + attr + ' data', group=2), video_id, fatal=fatal) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') + title = self._match_id(url) webpage = self._download_webpage(url, title) - thumbnail = self._html_search_meta('og:image', webpage, default=None) + tralbum = self._extract_data_attr(webpage, title) + thumbnail = self._og_search_thumbnail(webpage) track_id = None track = None @@ -94,11 +96,7 @@ def _real_extract(self, url): duration = None formats = [] - trackinfo_block = self._html_search_regex( - r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', - webpage, 'track info', default='{}') - - track_info = self._parse_json(trackinfo_block, title) + track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -114,40 +112,26 @@ def _real_extract(self, url): 'acodec': ext, 'abr': int_or_none(abr_str), }) - - track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track = track_info.get('title') + track_id = str_or_none( + track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) - def extract(key): - data = self._html_search_regex( - r',(["\']|")%s\1:\1(?P<value>(?:\\\1|((?!\1).))+)\1' % key, - webpage, key, default=None, group='value') - return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data - - track = extract('title') - artist = extract('artist') - album = extract('album_title') + embed = self._extract_data_attr(webpage, title, 'embed', False) + current = tralbum.get('current') or {} + artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') timestamp = unified_timestamp( - extract('publish_date') or extract('album_publish_date')) - release_date = unified_strdate(extract('album_release_date')) + current.get('publish_date') or tralbum.get('album_publish_date')) - download_link = self._search_regex( - r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P<url>(?:(?!\1).)+)\1', webpage, - 'download link', default=None, group='url') + download_link = tralbum.get('freeDownloadPage') if download_link: - track_id = self._search_regex( - r'\?id=(?P<id>\d+)&', - download_link, 'track id') + track_id = compat_str(tralbum['id']) download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, - 'blob', group='blob'), - track_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(download_webpage, track_id, 'blob') info = try_get( blob, (lambda x: x['digital_items'][0], @@ -213,20 +197,20 @@ def extract(key): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': release_date, + 'release_date': unified_strdate(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, 'track_id': track_id, 'artist': artist, - 'album': album, + 'album': embed.get('album_title'), 'formats': formats, } -class BandcampAlbumIE(InfoExtractor): +class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?' _TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -236,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '1353101989', 'ext': 'mp3', - 'title': 'Intro', + 'title': 'Blazo - Intro', + 'timestamp': 1311756226, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, { @@ -244,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '38097443', 'ext': 'mp3', - 'title': 'Kero One - Keep It Alive (Blazo remix)', + 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', + 'timestamp': 1311757238, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, ], @@ -280,6 +270,7 @@ class BandcampAlbumIE(InfoExtractor): 'title': '"Entropy" EP', 'uploader_id': 'jstrecords', 'id': 'entropy-ep', + 'description': 'md5:0ff22959c943622972596062f2f366a5', }, 'playlist_mincount': 3, }, { @@ -289,6 +280,7 @@ class BandcampAlbumIE(InfoExtractor): 'id': 'we-are-the-plague', 'title': 'WE ARE THE PLAGUE', 'uploader_id': 'insulters', + 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', }, 'playlist_count': 2, }] @@ -300,43 +292,34 @@ def suitable(cls, url): else super(BandcampAlbumIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('subdomain') - album_id = mobj.group('album_id') + uploader_id, album_id = re.match(self._VALID_URL, url).groups() playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - track_elements = re.findall( - r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) - if not track_elements: + tralbum = self._extract_data_attr(webpage, playlist_id) + track_info = tralbum.get('trackinfo') + if not track_info: raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ self.url_result( - compat_urlparse.urljoin(url, t_path), - ie=BandcampIE.ie_key(), - video_title=self._search_regex( - r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', - elem_content, 'track title', fatal=False)) - for elem_content, t_path in track_elements - if self._html_search_meta('duration', elem_content, default=None)] + urljoin(url, t['title_link']), BandcampIE.ie_key(), + str_or_none(t.get('track_id') or t.get('id')), t.get('title')) + for t in track_info + if t.get('duration')] - title = self._html_search_regex( - r'album_title\s*(?:"|["\']):\s*("|["\'])(?P<album>(?:\\\1|((?!\1).))+)\1', - webpage, 'title', fatal=False, group='album') - - if title: - title = title.replace(r'\"', '"') + current = tralbum.get('current') or {} return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': title, + 'title': current.get('title'), + 'description': current.get('about'), 'entries': entries, } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE): IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ @@ -351,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': '20170404', 'series': 'Bandcamp Weekly', 'episode': 'Magic Moments', - 'episode_number': 208, 'episode_id': '224', - } + }, + 'params': { + 'format': 'opus-lo', + }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', 'only_matching': True }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(webpage, show_id, 'blob') - show = blob['bcw_show'] - - # This is desired because any invalid show id redirects to `bandcamp.com` - # which happens to expose the latest Bandcamp Weekly episode. - show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + show = blob['bcw_data'][show_id] formats = [] for format_id, format_url in show['audio_stream'].items(): @@ -398,20 +375,8 @@ def _real_extract(self, url): if subtitle: title += ' - %s' % subtitle - episode_number = None - seq = blob.get('bcw_seq') - - if seq and isinstance(seq, list): - try: - episode_number = next( - int_or_none(e.get('episode_number')) - for e in seq - if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) - except StopIteration: - pass - return { - 'id': video_id, + 'id': show_id, 'title': title, 'description': show.get('desc') or show.get('short_desc'), 'duration': float_or_none(show.get('audio_duration')), @@ -419,7 +384,6 @@ def _real_extract(self, url): 'release_date': unified_strdate(show.get('published_date')), 'series': 'Bandcamp Weekly', 'episode': show.get('subtitle'), - 'episode_number': episode_number, - 'episode_id': compat_str(video_id), + 'episode_id': show_id, 'formats': formats } diff --git a/youtube_dlc/extractor/bbc.py b/youtube_dlc/extractor/bbc.py index 002c39c39..54cbcdc8e 100644 --- a/youtube_dlc/extractor/bbc.py +++ b/youtube_dlc/extractor/bbc.py @@ -981,7 +981,7 @@ def _real_extract(self, url): group_id = self._search_regex( r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, webpage, 'group id', default=None) - if playlist_id: + if group_id: return self.url_result( 'https://www.bbc.co.uk/programmes/%s' % group_id, ie=BBCCoUkIE.ie_key()) @@ -1092,10 +1092,26 @@ def _real_extract(self, url): self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, 'bbcthree config', default='{}'), - playlist_id, transform_source=js_to_json, fatal=False) - if bbc3_config: + playlist_id, transform_source=js_to_json, fatal=False) or {} + payload = bbc3_config.get('payload') or {} + if payload: + clip = payload.get('currentClip') or {} + clip_vpid = clip.get('vpid') + clip_title = clip.get('title') + if clip_vpid and clip_title: + formats, subtitles = self._download_media_selector(clip_vpid) + self._sort_formats(formats) + return { + 'id': clip_vpid, + 'title': clip_title, + 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), + 'description': clip.get('description'), + 'duration': parse_duration(clip.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } bbc3_playlist = try_get( - bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + payload, lambda x: x['content']['bbcMedia']['playlist'], dict) if bbc3_playlist: playlist_title = bbc3_playlist.get('title') or playlist_title @@ -1118,6 +1134,39 @@ def _real_extract(self, url): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if initial_data: + def parse_media(media): + if not media: + return + for item in (try_get(media, lambda x: x['media']['items'], list) or []): + item_id = item.get('id') + item_title = item.get('title') + if not (item_id and item_title): + continue + formats, subtitles = self._download_media_selector(item_id) + self._sort_formats(formats) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item.get('holdingImageUrl'), + 'formats': formats, + 'subtitles': subtitles, + }) + for resp in (initial_data.get('data') or {}).values(): + name = resp.get('name') + if name == 'media-experience': + parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) + elif name == 'article': + for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + if block.get('type') != 'media': + continue + parse_media(block.get('model')) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dlc/extractor/bitchute.py b/youtube_dlc/extractor/bitchute.py index 92fc70b5a..94219a138 100644 --- a/youtube_dlc/extractor/bitchute.py +++ b/youtube_dlc/extractor/bitchute.py @@ -36,6 +36,14 @@ class BitChuteIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL, + webpage)] + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dlc/extractor/bitwave.py b/youtube_dlc/extractor/bitwave.py new file mode 100644 index 000000000..eb16c469d --- /dev/null +++ b/youtube_dlc/extractor/bitwave.py @@ -0,0 +1,61 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BitwaveReplayIE(InfoExtractor): + IE_NAME = 'bitwave:replay' + _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<user>\w+)/replay/(?P<id>\w+)/?$' + _TEST = { + 'url': 'https://bitwave.tv/RhythmicCarnage/replay/z4P6eq5L7WDrM85UCrVr', + 'only_matching': True + } + + def _real_extract(self, url): + replay_id = self._match_id(url) + replay = self._download_json( + 'https://api.bitwave.tv/v1/replays/' + replay_id, + replay_id + ) + + return { + 'id': replay_id, + 'title': replay['data']['title'], + 'uploader': replay['data']['name'], + 'uploader_id': replay['data']['name'], + 'url': replay['data']['url'], + 'thumbnails': [ + {'url': x} for x in replay['data']['thumbnails'] + ], + } + + +class BitwaveStreamIE(InfoExtractor): + IE_NAME = 'bitwave:stream' + _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<id>\w+)/?$' + _TEST = { + 'url': 'https://bitwave.tv/doomtube', + 'only_matching': True + } + + def _real_extract(self, url): + username = self._match_id(url) + channel = self._download_json( + 'https://api.bitwave.tv/v1/channels/' + username, + username) + + formats = self._extract_m3u8_formats( + channel['data']['url'], username, + 'mp4') + self._sort_formats(formats) + + return { + 'id': username, + 'title': self._live_title(channel['data']['title']), + 'uploader': username, + 'uploader_id': username, + 'formats': formats, + 'thumbnail': channel['data']['thumbnail'], + 'is_live': True, + 'view_count': channel['data']['viewCount'] + } diff --git a/youtube_dlc/extractor/box.py b/youtube_dlc/extractor/box.py new file mode 100644 index 000000000..aae82d1af --- /dev/null +++ b/youtube_dlc/extractor/box.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_iso8601, + # try_get, + update_url_query, +) + + +class BoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' + _TEST = { + 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', + 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', + 'info_dict': { + 'id': '510727257538', + 'ext': 'mp4', + 'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4', + 'uploader': 'MLS Video', + 'timestamp': 1566320259, + 'upload_date': '20190820', + 'uploader_id': '235196876', + } + } + + def _real_extract(self, url): + shared_name, file_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, file_id) + request_token = self._parse_json(self._search_regex( + r'Box\.config\s*=\s*({.+?});', webpage, + 'Box config'), file_id)['requestToken'] + access_token = self._download_json( + 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, + 'Downloading token JSON metadata', + data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ + 'Content-Type': 'application/json', + 'X-Request-Token': request_token, + 'X-Box-EndUser-API': 'sharedName=' + shared_name, + })[file_id]['read'] + shared_link = 'https://app.box.com/s/' + shared_name + f = self._download_json( + 'https://api.box.com/2.0/files/' + file_id, file_id, + 'Downloading file JSON metadata', headers={ + 'Authorization': 'Bearer ' + access_token, + 'BoxApi': 'shared_link=' + shared_link, + 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats + }, query={ + 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' + }) + title = f['name'] + + query = { + 'access_token': access_token, + 'shared_link': shared_link + } + + formats = [] + + # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): + # entry_url_template = try_get( + # entry, lambda x: x['content']['url_template']) + # if not entry_url_template: + # continue + # representation = entry.get('representation') + # if representation == 'dash': + # TODO: append query to every fragment URL + # formats.extend(self._extract_mpd_formats( + # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), + # file_id, query=query)) + + authenticated_download_url = f.get('authenticated_download_url') + if authenticated_download_url and f.get('is_download_available'): + formats.append({ + 'ext': f.get('extension') or determine_ext(title), + 'filesize': f.get('size'), + 'format_id': 'download', + 'url': update_url_query(authenticated_download_url, query), + }) + + self._sort_formats(formats) + + creator = f.get('created_by') or {} + + return { + 'id': file_id, + 'title': title, + 'formats': formats, + 'description': f.get('description') or None, + 'uploader': creator.get('name'), + 'timestamp': parse_iso8601(f.get('created_at')), + 'uploader_id': creator.get('id'), + } diff --git a/youtube_dlc/extractor/brightcove.py b/youtube_dlc/extractor/brightcove.py index 2aa9f4782..c6ca939dd 100644 --- a/youtube_dlc/extractor/brightcove.py +++ b/youtube_dlc/extractor/brightcove.py @@ -147,7 +147,7 @@ class BrightcoveLegacyIE(InfoExtractor): ] @classmethod - def _build_brighcove_url(cls, object_str): + def _build_brightcove_url(cls, object_str): """ Build a Brightcove url from a xml string containing <object class="BrightcoveExperience">{params}</object> @@ -217,7 +217,7 @@ def find_param(name): return cls._make_brightcove_url(params) @classmethod - def _build_brighcove_url_from_js(cls, object_js): + def _build_brightcove_url_from_js(cls, object_js): # The layout of JS is as follows: # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { # // build Brightcove <object /> XML @@ -272,12 +272,12 @@ def _extract_brightcove_urls(cls, webpage): ).+?>\s*</object>''', webpage) if matches: - return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + return list(filter(None, [cls._build_brightcove_url(m) for m in matches])) matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) if matches: return list(filter(None, [ - cls._build_brighcove_url_from_js(custom_bc) + cls._build_brightcove_url_from_js(custom_bc) for custom_bc in matches])) return [src for _, src in re.findall( r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] @@ -471,12 +471,17 @@ def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() formats = [] + sources_num = len(json_data.get('sources')) + key_systems_present = 0 for source in json_data.get('sources', []): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if ext == 'ism' or container == 'WVM' or source.get('key_systems'): + # https://apis.support.brightcove.com/playback/references/playback-api-video-fields-reference.html + if source.get('key_systems'): + key_systems_present += 1 + continue + elif ext == 'ism' or container == 'WVM': continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -533,6 +538,10 @@ def build_format_id(kind): 'format_id': build_format_id('rtmp'), }) formats.append(f) + + if sources_num == key_systems_present: + raise ExtractorError('This video is DRM protected', expected=True) + if not formats: # for sonyliv.com DRM protected videos s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') diff --git a/youtube_dlc/extractor/cda.py b/youtube_dlc/extractor/cda.py index 0c3af23d5..d67900e62 100644 --- a/youtube_dlc/extractor/cda.py +++ b/youtube_dlc/extractor/cda.py @@ -5,10 +5,16 @@ import re from .common import InfoExtractor +from ..compat import ( + compat_chr, + compat_ord, + compat_urllib_parse_unquote, +) from ..utils import ( ExtractorError, float_or_none, int_or_none, + merge_dicts, multipart_encode, parse_duration, random_birthday, @@ -107,8 +113,9 @@ def _real_extract(self, url): r'Odsłony:(?:\s| )*([0-9]+)', webpage, 'view_count', default=None) average_rating = self._search_regex( - r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', - webpage, 'rating', fatal=False, group='rating_value') + (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', + r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, + group='rating_value') info_dict = { 'id': video_id, @@ -123,6 +130,24 @@ def _real_extract(self, url): 'age_limit': 18 if need_confirm_age else 0, } + # Source: https://www.cda.pl/js/player.js?t=1606154898 + def decrypt_file(a): + for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): + a = a.replace(p, '') + a = compat_urllib_parse_unquote(a) + b = [] + for c in a: + f = compat_ord(c) + b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f)) + a = ''.join(b) + a = a.replace('.cda.mp4', '') + for p in ('.2cda.pl', '.3cda.pl'): + a = a.replace(p, '.cda.pl') + if '/upstream' in a: + a = a.replace('/upstream', '.mp4/upstream') + return 'https://' + a + return 'https://' + a + '.mp4' + def extract_format(page, version): json_str = self._html_search_regex( r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, @@ -141,6 +166,8 @@ def extract_format(page, version): video['file'] = codecs.decode(video['file'], 'rot_13') if video['file'].endswith('adc.mp4'): video['file'] = video['file'].replace('adc.mp4', '.mp4') + elif not video['file'].startswith('http'): + video['file'] = decrypt_file(video['file']) f = { 'url': video['file'], } @@ -179,4 +206,6 @@ def extract_format(page, version): self._sort_formats(formats) - return info_dict + info = self._search_json_ld(webpage, video_id, default={}) + + return merge_dicts(info_dict, info) diff --git a/youtube_dlc/extractor/cnbc.py b/youtube_dlc/extractor/cnbc.py index 6889b0f40..7b9f4536a 100644 --- a/youtube_dlc/extractor/cnbc.py +++ b/youtube_dlc/extractor/cnbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import smuggle_url @@ -38,7 +39,7 @@ def _real_extract(self, url): class CNBCVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)' _TEST = { 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', 'info_dict': { @@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor): } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, - 'video id') + path, display_id = re.match(self._VALID_URL, url).groups() + video_id = self._download_json( + 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ + 'query': '''{ + page(path: "%s") { + vcpsId + } +}''' % path, + })['data']['page']['vcpsId'] return self.url_result( - 'http://video.cnbc.com/gallery/?video=%s' % video_id, + 'http://video.cnbc.com/gallery/?video=%d' % video_id, CNBCIE.ie_key()) diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index 4b42d699f..aacdf06fe 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -1456,9 +1456,10 @@ def _is_valid_url(self, url, video_id, item='video', headers={}): try: self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True - except ExtractorError: + except ExtractorError as e: self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) + '%s: %s URL is invalid, skipping: %s' + % (video_id, item, error_to_compat_str(e.cause))) return False def http_scheme(self): @@ -1663,7 +1664,7 @@ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, # just the media without qualities renditions. # Fortunately, master playlist can be easily distinguished from media # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] - # master playlist tags MUST NOT appear in a media playist and vice versa. + # master playlist tags MUST NOT appear in a media playlist and vice versa. # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every # media playlist and MUST NOT appear in master playlist thus we can # clearly detect media playlist with this criterion. @@ -2596,6 +2597,7 @@ def _media_formats(src, cur_media_type, type_info={}): def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] + hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') @@ -2608,6 +2610,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') hls_host = hosts.get('hls') if hls_host: @@ -2615,6 +2618,31 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + http_host = hosts.get('http') + if http_host and 'hdnea=' not in manifest_url: + REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' + qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') + qualities_length = len(qualities) + if len(formats) in (qualities_length + 1, qualities_length * 2 + 1): + i = 0 + http_formats = [] + for f in formats: + if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': + for protocol in ('http', 'https'): + http_f = f.copy() + del http_f['manifest_url'] + http_url = re.sub( + REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url']) + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), + 'url': http_url, + 'protocol': protocol, + }) + http_formats.append(http_f) + i += 1 + formats.extend(http_formats) + return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): diff --git a/youtube_dlc/extractor/condenast.py b/youtube_dlc/extractor/condenast.py index ed278fefc..d5e77af32 100644 --- a/youtube_dlc/extractor/condenast.py +++ b/youtube_dlc/extractor/condenast.py @@ -16,6 +16,8 @@ mimetype2ext, orderedSet, parse_iso8601, + strip_or_none, + try_get, ) @@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor): 'uploader': 'gq', 'upload_date': '20170321', 'timestamp': 1490126427, + 'description': 'How much grimmer would things be if these people were competent?', }, }, { # JS embed @@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor): 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', 'uploader': 'arstechnica', 'upload_date': '20150916', - 'timestamp': 1442434955, + 'timestamp': 1442434920, } }, { 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', @@ -196,6 +199,13 @@ def _extract_video(self, params): }) self._sort_formats(formats) + subtitles = {} + for t, caption in video_info.get('captions', {}).items(): + caption_url = caption.get('src') + if not (t in ('vtt', 'srt', 'tml') and caption_url): + continue + subtitles.setdefault('en', []).append({'url': caption_url}) + return { 'id': video_id, 'formats': formats, @@ -208,6 +218,7 @@ def _extract_video(self, params): 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), 'categories': video_info.get('categories'), + 'subtitles': subtitles, } def _real_extract(self, url): @@ -225,8 +236,16 @@ def _real_extract(self, url): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage, display_id) - info = self._search_json_ld( - webpage, display_id, fatal=False) + video = try_get(self._parse_json(self._search_regex( + r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', '{}'), display_id), + lambda x: x['transformed']['video']) + if video: + params = {'videoId': video['id']} + info = {'description': strip_or_none(video.get('description'))} + else: + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, fatal=False) info.update(self._extract_video(params)) return info diff --git a/youtube_dlc/extractor/discoverynetworks.py b/youtube_dlc/extractor/discoverynetworks.py index 607a54948..c512b95d0 100644 --- a/youtube_dlc/extractor/discoverynetworks.py +++ b/youtube_dlc/extractor/discoverynetworks.py @@ -7,7 +7,7 @@ class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' _TESTS = [{ 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', @@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE): }, { 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dlc/extractor/europa.py b/youtube_dlc/extractor/europa.py index 1efc0b2ec..2c1c747a1 100644 --- a/youtube_dlc/extractor/europa.py +++ b/youtube_dlc/extractor/europa.py @@ -60,7 +60,7 @@ def get_item(type_, preference): title = get_item('title', preferred_langs) or video_id description = get_item('description', preferred_langs) - thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') + thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) @@ -85,7 +85,7 @@ def get_item(type_, preference): 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnmail, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 01f69c006..f5894504e 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -36,6 +36,7 @@ from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amara import AmaraIE from .alura import ( AluraIE, AluraCourseIE @@ -62,7 +63,7 @@ ARDMediathekIE, ) from .arte import ( - ArteTVPlus7IE, + ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, ) @@ -116,6 +117,10 @@ BitChuteIE, BitChuteChannelIE, ) +from .bitwave import ( + BitwaveReplayIE, + BitwaveStreamIE, +) from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, @@ -125,6 +130,7 @@ from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bostonglobe import BostonGlobeIE +from .box import BoxIE from .bpb import BpbIE from .br import ( BRIE, @@ -546,6 +552,7 @@ EHFTVIE, ITTFIE, ) +from .lbry import LBRYIE from .lci import LCIIE from .lcp import ( LcpPlayIE, @@ -621,6 +628,7 @@ from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .medaltv import MedalTVIE from .mediaset import MediasetIE from .mediasite import ( MediasiteIE, @@ -755,6 +763,7 @@ from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE +from .nitter import NitterIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE @@ -802,6 +811,7 @@ from .nytimes import ( NYTimesIE, NYTimesArticleIE, + NYTimesCookingIE, ) from .nuvid import NuvidIE from .nzz import NZZIE @@ -864,6 +874,10 @@ ) from .piksel import PikselIE from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) from .pladform import PladformIE from .platzi import ( PlatziIE, @@ -940,6 +954,11 @@ RayWenderlichCourseIE, ) from .rbmaradio import RBMARadioIE +from .rcs import ( + RCSIE, + RCSEmbedsIE, + RCSVariousIE, +) from .rds import RDSIE from .redbulltv import ( RedBullTVIE, @@ -982,6 +1001,7 @@ from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE +from .rumble import RumbleEmbedIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -1041,6 +1061,10 @@ SkyNewsIE, SkySportsIE, ) +from .skyitalia import ( + SkyArteItaliaIE, + SkyItaliaIE, +) from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE @@ -1078,8 +1102,7 @@ SpankBangPlaylistIE, ) from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE +from .spiegel import SpiegelIE from .spike import ( BellatorIE, ParamountNetworkIE, @@ -1093,6 +1116,12 @@ from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE from .srgssr import ( @@ -1174,6 +1203,7 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .thisvid import ThisVidIE from .threeqsdn import ThreeQSDNIE from .tiktok import TikTokIE from .tinypic import TinyPicIE @@ -1385,8 +1415,8 @@ ) from .vlive import ( VLiveIE, + VLivePostIE, VLiveChannelIE, - VLivePlaylistIE ) from .vodlocker import VodlockerIE from .vodpl import VODPlIE @@ -1503,21 +1533,18 @@ from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, - YoutubeChannelIE, YoutubeFavouritesIE, YoutubeHistoryIE, - YoutubeLiveIE, + YoutubeTabIE, YoutubePlaylistIE, - YoutubePlaylistsIE, YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE, - YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, - YoutubeUserIE, + YoutubeYtUserIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE @@ -1543,4 +1570,5 @@ ) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE +from .zoom import ZoomIE from .zype import ZypeIE diff --git a/youtube_dlc/extractor/franceinter.py b/youtube_dlc/extractor/franceinter.py index 05806895c..ae822a50e 100644 --- a/youtube_dlc/extractor/franceinter.py +++ b/youtube_dlc/extractor/franceinter.py @@ -16,6 +16,7 @@ class FranceInterIE(InfoExtractor): 'ext': 'mp3', 'title': 'Affaire Cahuzac : le contentieux du compte en Suisse', 'description': 'md5:401969c5d318c061f86bda1fa359292b', + 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20160907', }, } @@ -31,6 +32,7 @@ def _real_extract(self, url): title = self._og_search_title(webpage) description = self._og_search_description(webpage) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) upload_date_str = self._search_regex( r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', @@ -48,6 +50,7 @@ def _real_extract(self, url): 'id': video_id, 'title': title, 'description': description, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'formats': [{ 'url': video_url, diff --git a/youtube_dlc/extractor/francetv.py b/youtube_dlc/extractor/francetv.py index e340cddba..ab0df1bed 100644 --- a/youtube_dlc/extractor/francetv.py +++ b/youtube_dlc/extractor/francetv.py @@ -17,6 +17,7 @@ parse_duration, try_get, url_or_none, + urljoin, ) from .dailymotion import DailymotionIE @@ -128,18 +129,38 @@ def sign(manifest_url, manifest_id): is_live = None - formats = [] - for video in info['videos']: - if video['statut'] != 'ONLINE': + videos = [] + + for video in (info.get('videos') or []): + if video.get('statut') != 'ONLINE': continue - video_url = video['url'] + if not video.get('url'): + continue + videos.append(video) + + if not videos: + for device_type in ['desktop', 'mobile']: + fallback_info = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading fallback %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if fallback_info and fallback_info.get('video'): + videos.append(fallback_info['video']) + + formats = [] + for video in videos: + video_url = video.get('url') if not video_url: continue if is_live is None: is_live = (try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], - bool) is True) or '/live.francetv.fr/' in video_url - format_id = video['format'] + video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True + or video.get('is_live') is True + or '/live.francetv.fr/' in video_url) + format_id = video.get('format') ext = determine_ext(video_url) if ext == 'f4m': if georestricted: @@ -154,6 +175,9 @@ def sign(manifest_url, manifest_id): sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -166,6 +190,7 @@ def sign(manifest_url, manifest_id): 'url': video_url, 'format_id': format_id, }) + self._sort_formats(formats) title = info['titre'] @@ -185,10 +210,10 @@ def sign(manifest_url, manifest_id): return { 'id': video_id, 'title': self._live_title(title) if is_live else title, - 'description': clean_html(info['synopsis']), - 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), - 'timestamp': int_or_none(info['diffusion']['timestamp']), + 'description': clean_html(info.get('synopsis')), + 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')), + 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), + 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index 3fab929a8..e5d29f316 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -91,6 +91,7 @@ from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE +from .arte import ArteTVEmbedIE from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -120,6 +121,8 @@ from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE from .gedi import GediEmbedsIE +from .rcs import RCSEmbedsIE +from .bitchute import BitChuteIE class GenericIE(InfoExtractor): @@ -842,7 +845,7 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - # MTVSercices embed + # MTVServices embed { 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html', 'md5': 'ca1aef97695ef2c1d6973256a57e5252', @@ -2761,11 +2764,9 @@ def _real_extract(self, url): return self.url_result(ustream_url, UstreamIE.ie_key()) # Look for embedded arte.tv player - mobj = re.search( - r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'ArteTVEmbed') + arte_urls = ArteTVEmbedIE._extract_urls(webpage) + if arte_urls: + return self.playlist_from_matches(arte_urls, video_id, video_title) # Look for embedded francetv player mobj = re.search( @@ -3220,6 +3221,16 @@ def _real_extract(self, url): return self.playlist_from_matches( gedi_urls, video_id, video_title, ie=GediEmbedsIE.ie_key()) + rcs_urls = RCSEmbedsIE._extract_urls(webpage) + if rcs_urls: + return self.playlist_from_matches( + rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key()) + + bitchute_urls = BitChuteIE._extract_urls(webpage) + if bitchute_urls: + return self.playlist_from_matches( + bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dlc/extractor/googledrive.py b/youtube_dlc/extractor/googledrive.py index ec0d58a57..fdb15795a 100644 --- a/youtube_dlc/extractor/googledrive.py +++ b/youtube_dlc/extractor/googledrive.py @@ -3,11 +3,13 @@ import re from .common import InfoExtractor +from ..compat import compat_parse_qs from ..utils import ( determine_ext, ExtractorError, int_or_none, lowercase_escape, + try_get, update_url_query, ) @@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor): # video can't be watched anonymously due to view count limit reached, # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', - 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', - 'info_dict': { - 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', - 'ext': 'mp4', - 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', - } + 'only_matching': True, }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', - 'info_dict': { - 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', - 'ext': 'mp4', - 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', - 'duration': 189, - }, 'only_matching': True, }, { 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', @@ -171,23 +162,21 @@ def _get_automatic_captions(self, video_id, subtitles_id, hl): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://docs.google.com/file/d/%s' % video_id, video_id) + video_info = compat_parse_qs(self._download_webpage( + 'https://drive.google.com/get_video_info', + video_id, query={'docid': video_id})) - title = self._search_regex( - r'"title"\s*,\s*"([^"]+)', webpage, 'title', - default=None) or self._og_search_title(webpage) - duration = int_or_none(self._search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', - default=None)) + def get_value(key): + return try_get(video_info, lambda x: x[key][0]) + + reason = get_value('reason') + title = get_value('title') + if not title and reason: + raise ExtractorError(reason, expected=True) formats = [] - fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, - 'fmt stream map', default='').split(',') - fmt_list = self._search_regex( - r'"fmt_list"\s*,\s*"([^"]+)', webpage, - 'fmt_list', default='').split(',') + fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') + fmt_list = (get_value('fmt_list') or '').split(',') if fmt_stream_map and fmt_list: resolutions = {} for fmt in fmt_list: @@ -257,19 +246,14 @@ def add_source_format(urlh): if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) - if not formats: - reason = self._search_regex( - r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) - if reason: - raise ExtractorError(reason, expected=True) + if not formats and reason: + raise ExtractorError(reason, expected=True) self._sort_formats(formats) - hl = self._search_regex( - r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) + hl = get_value('hl') subtitles_id = None - ttsurl = self._search_regex( - r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) + ttsurl = get_value('ttsurl') if ttsurl: # the video Id for subtitles will be the last value in the ttsurl # query string @@ -281,8 +265,8 @@ def add_source_format(urlh): return { 'id': video_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, + 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id, + 'duration': int_or_none(get_value('length_seconds')), 'formats': formats, 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), 'automatic_captions': self.extract_automatic_captions( diff --git a/youtube_dlc/extractor/ina.py b/youtube_dlc/extractor/ina.py index 12695af27..b3b2683cb 100644 --- a/youtube_dlc/extractor/ina.py +++ b/youtube_dlc/extractor/ina.py @@ -12,7 +12,7 @@ class InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' _TESTS = [{ 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', @@ -31,6 +31,9 @@ class InaIE(InfoExtractor): }, { 'url': 'https://www.ina.fr/video/P16173408-video.html', 'only_matching': True, + }, { + 'url': 'http://m.ina.fr/video/I12055569', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dlc/extractor/infoq.py b/youtube_dlc/extractor/infoq.py index 18249cf9b..0a70a1fb4 100644 --- a/youtube_dlc/extractor/infoq.py +++ b/youtube_dlc/extractor/infoq.py @@ -54,7 +54,7 @@ class InfoQIE(BokeCCBaseIE): def _extract_rtmp_video(self, webpage): # The server URL is hardcoded - video_url = 'rtmpe://video.infoq.com/cfx/st/' + video_url = 'rtmpe://videof.infoq.com/cfx/st/' # Extract video URL encoded_id = self._search_regex( @@ -86,17 +86,18 @@ def _extract_http_video(self, webpage): return [{ 'format_id': 'http_video', 'url': http_video_url, + 'http_headers': {'Referer': 'https://www.infoq.com/'}, }] def _extract_http_audio(self, webpage, video_id): - fields = self._hidden_inputs(webpage) + fields = self._form_hidden_inputs('mp3Form', webpage) http_audio_url = fields.get('filename') if not http_audio_url: return [] # base URL is found in the Location header in the response returned by # GET https://www.infoq.com/mp3download.action?filename=... when logged in. - http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url) http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage)) # audio file seem to be missing some times even if there is a download link diff --git a/youtube_dlc/extractor/instagram.py b/youtube_dlc/extractor/instagram.py index b061850a1..c3eba0114 100644 --- a/youtube_dlc/extractor/instagram.py +++ b/youtube_dlc/extractor/instagram.py @@ -126,16 +126,23 @@ def _real_extract(self, url): uploader_id, like_count, comment_count, comments, height, width) = [None] * 11 - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - webpage, 'shared data', default='{}'), - video_id, fatal=False) + shared_data = try_get(webpage, + (lambda x: self._parse_json( + self._search_regex( + r'window\.__additionalDataLoaded\(\'/(?:p|tv)/(?:[^/?#&]+)/\',({.+?})\);', + x, 'additional data', default='{}'), + video_id, fatal=False), + lambda x: self._parse_json( + self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', + x, 'shared data', default='{}'), + video_id, fatal=False)['entry_data']['PostPage'][0]), + None) if shared_data: media = try_get( shared_data, - (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], - lambda x: x['entry_data']['PostPage'][0]['media']), + (lambda x: x['graphql']['shortcode_media'], + lambda x: x['media']), dict) if media: video_url = media.get('video_url') @@ -144,7 +151,7 @@ def _real_extract(self, url): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') - thumbnail = media.get('display_src') + thumbnail = media.get('display_src') or media.get('thumbnail_src') timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') diff --git a/youtube_dlc/extractor/iqiyi.py b/youtube_dlc/extractor/iqiyi.py index cd11aa70f..5df674daf 100644 --- a/youtube_dlc/extractor/iqiyi.py +++ b/youtube_dlc/extractor/iqiyi.py @@ -150,7 +150,7 @@ def run(self, target, ip, timestamp): elif function in other_functions: other_functions[function]() else: - raise ExtractorError('Unknown funcion %s' % function) + raise ExtractorError('Unknown function %s' % function) return sdk.target diff --git a/youtube_dlc/extractor/itv.py b/youtube_dlc/extractor/itv.py index ad2f4eca5..20144cd82 100644 --- a/youtube_dlc/extractor/itv.py +++ b/youtube_dlc/extractor/itv.py @@ -20,6 +20,7 @@ merge_dicts, parse_duration, smuggle_url, + try_get, url_or_none, xpath_with_ns, xpath_element, @@ -280,12 +281,12 @@ def extract_subtitle(sub_url): class ITVBTCCIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TEST = { - 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', + 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action', 'info_dict': { - 'id': 'btcc-2018-all-the-action-from-brands-hatch', - 'title': 'BTCC 2018: All the action from Brands Hatch', + 'id': 'btcc-2019-brands-hatch-gp-race-action', + 'title': 'BTCC 2019: Brands Hatch GP race action', }, - 'playlist_mincount': 9, + 'playlist_count': 12, } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' @@ -294,6 +295,16 @@ def _real_extract(self, url): webpage = self._download_webpage(url, playlist_id) + json_map = try_get(self._parse_json(self._html_search_regex( + '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id), + lambda x: x['props']['pageProps']['article']['body']['content']) or [] + + # Discard empty objects + video_ids = [] + for video in json_map: + if video['data'].get('id'): + video_ids.append(video['data']['id']) + entries = [ self.url_result( smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { @@ -305,7 +316,7 @@ def _real_extract(self, url): 'referrer': url, }), ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)] + for video_id in video_ids] title = self._og_search_title(webpage, fatal=False) diff --git a/youtube_dlc/extractor/kusi.py b/youtube_dlc/extractor/kusi.py index 6a7e3baa7..9833d35eb 100644 --- a/youtube_dlc/extractor/kusi.py +++ b/youtube_dlc/extractor/kusi.py @@ -64,7 +64,7 @@ def _real_extract(self, url): duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) description = xpath_text(doc, 'ABSTRACT') thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') - createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) + creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') formats = [] @@ -84,5 +84,5 @@ def _real_extract(self, url): 'duration': duration, 'formats': formats, 'thumbnail': thumbnail, - 'timestamp': createtion_time, + 'timestamp': creation_time, } diff --git a/youtube_dlc/extractor/la7.py b/youtube_dlc/extractor/la7.py index f5d4564fa..74b006fb5 100644 --- a/youtube_dlc/extractor/la7.py +++ b/youtube_dlc/extractor/la7.py @@ -36,6 +36,9 @@ class LA7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + if not url.startswith('http'): + url = '%s//%s' % (self.http_scheme(), url) + webpage = self._download_webpage(url, video_id) player_data = self._search_regex( diff --git a/youtube_dlc/extractor/lbry.py b/youtube_dlc/extractor/lbry.py new file mode 100644 index 000000000..6177297ab --- /dev/null +++ b/youtube_dlc/extractor/lbry.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + try_get, +) + + +class LBRYIE(InfoExtractor): + IE_NAME = 'lbry.tv' + _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[^:]+:[0-9a-z]+/[^:]+:[0-9a-z])' + _TESTS = [{ + # Video + 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', + 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', + 'info_dict': { + 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', + 'ext': 'mp4', + 'title': 'First day in LBRY? Start HERE!', + 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', + 'timestamp': 1595694354, + 'upload_date': '20200725', + } + }, { + # Audio + 'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e', + 'md5': 'c94017d3eba9b49ce085a8fad6b98d00', + 'info_dict': { + 'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', + 'ext': 'mp3', + 'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding', + 'description': 'md5:661ac4f1db09f31728931d7b88807a61', + 'timestamp': 1591312601, + 'upload_date': '20200604', + } + }, { + 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', + 'only_matching': True, + }, { + 'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b", + 'only_matching': True, + }] + + def _call_api_proxy(self, method, display_id, params): + return self._download_json( + 'https://api.lbry.tv/api/v1/proxy', display_id, + headers={'Content-Type': 'application/json-rpc'}, + data=json.dumps({ + 'method': method, + 'params': params, + }).encode())['result'] + + def _real_extract(self, url): + display_id = self._match_id(url).replace(':', '#') + uri = 'lbry://' + display_id + result = self._call_api_proxy( + 'resolve', display_id, {'urls': [uri]})[uri] + result_value = result['value'] + if result_value.get('stream_type') not in ('video', 'audio'): + raise ExtractorError('Unsupported URL', expected=True) + streaming_url = self._call_api_proxy( + 'get', display_id, {'uri': uri})['streaming_url'] + source = result_value.get('source') or {} + media = result_value.get('video') or result_value.get('audio') or {} + signing_channel = result_value.get('signing_channel') or {} + + return { + 'id': result['claim_id'], + 'title': result_value['title'], + 'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str), + 'description': result_value.get('description'), + 'license': result_value.get('license'), + 'timestamp': int_or_none(result.get('timestamp')), + 'tags': result_value.get('tags'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + 'duration': int_or_none(media.get('duration')), + 'channel': signing_channel.get('name'), + 'channel_id': signing_channel.get('claim_id'), + 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), + 'filesize': int_or_none(source.get('size')), + 'url': streaming_url, + } diff --git a/youtube_dlc/extractor/lrt.py b/youtube_dlc/extractor/lrt.py index f5c997ef4..89d549858 100644 --- a/youtube_dlc/extractor/lrt.py +++ b/youtube_dlc/extractor/lrt.py @@ -5,28 +5,26 @@ from .common import InfoExtractor from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - remove_end, + clean_html, + merge_dicts, ) class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' - _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))' _TESTS = [{ # m3u8 download - 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', - 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', + 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', + 'md5': '85cb2bb530f31d91a9c65b479516ade4', 'info_dict': { - 'id': '54391', + 'id': '2000127261', 'ext': 'mp4', - 'title': 'Septynios Kauno dienos', - 'description': 'md5:24d84534c7dc76581e59f5689462411a', - 'duration': 1783, - 'view_count': int, - 'like_count': int, + 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', + 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', + 'duration': 3035, + 'timestamp': 1604079000, + 'upload_date': '20201030', }, }, { # direct mp3 download @@ -43,52 +41,35 @@ class LRTIE(InfoExtractor): }, }] + def _extract_js_var(self, webpage, var_name, default): + return self._search_regex( + r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, + webpage, var_name.replace('_', ' '), default, group=2) + def _real_extract(self, url): - video_id = self._match_id(url) + path, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - LRT') + media_url = self._extract_js_var(webpage, 'main_url', path) + media = self._download_json(self._extract_js_var( + webpage, 'media_info_url', + 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), + video_id, query={'url': media_url}) + jw_data = self._parse_jwplayer_data( + media['playlist_item'], video_id, base_url=url) - formats = [] - for _, file_url in re.findall( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): - ext = determine_ext(file_url) - if ext not in ('m3u8', 'mp3'): + json_ld_data = self._search_json_ld(webpage, video_id) + + tags = [] + for tag in (media.get('tags') or []): + tag_name = tag.get('name') + if not tag_name: continue - # mp3 served as m3u8 produces stuttered media file - if ext == 'm3u8' and '.mp3' in file_url: - continue - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - fatal=False)) - elif ext == 'mp3': - formats.append({ - 'url': file_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) + tags.append(tag_name) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - duration = parse_duration(self._search_regex( - r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', - webpage, 'duration', default=None, group='duration')) - - view_count = int_or_none(self._html_search_regex( - r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', - webpage, 'view count', fatal=False, group='count')) - like_count = int_or_none(self._search_regex( - r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', - webpage, 'like count', fatal=False, group='count')) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, + clean_info = { + 'description': clean_html(media.get('content')), + 'tags': tags, } + + return merge_dicts(clean_info, jw_data, json_ld_data) diff --git a/youtube_dlc/extractor/mailru.py b/youtube_dlc/extractor/mailru.py index 6fdf70aa6..5bfe40649 100644 --- a/youtube_dlc/extractor/mailru.py +++ b/youtube_dlc/extractor/mailru.py @@ -12,6 +12,7 @@ parse_duration, remove_end, try_get, + urljoin, ) @@ -93,6 +94,14 @@ class MailRuIE(InfoExtractor): { 'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html', 'only_matching': True, + }, + { + 'url': 'https://my.mail.ru/mail/cloud-strife/video/embed/Games/2009', + 'only_matching': True, + }, + { + 'url': 'https://videoapi.my.mail.ru/videos/embed/mail/cloud-strife/Games/2009.html', + 'only_matching': True, } ] @@ -110,7 +119,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) page_config = self._parse_json(self._search_regex([ r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', - r'(?s)"video":\s*(\{.+?\}),'], + r'(?s)"video":\s*({.+?}),'], webpage, 'page config', default='{}'), video_id, fatal=False) if page_config: meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') or page_config.get('metadataUrl') @@ -121,7 +130,7 @@ def _real_extract(self, url): # fix meta_url if missing the host address if re.match(r'^\/\+\/', meta_url): - meta_url = 'https://my.mail.ru' + meta_url + meta_url = urljoin('https://my.mail.ru', meta_url) if meta_url: video_data = self._download_json( diff --git a/youtube_dlc/extractor/malltv.py b/youtube_dlc/extractor/malltv.py index 6f4fd927f..fadfd9338 100644 --- a/youtube_dlc/extractor/malltv.py +++ b/youtube_dlc/extractor/malltv.py @@ -1,10 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import merge_dicts +from ..utils import ( + clean_html, + dict_get, + float_or_none, + int_or_none, + merge_dicts, + parse_duration, + try_get, +) class MallTVIE(InfoExtractor): @@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor): 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', 'ext': 'mp4', 'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', - 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35', 'duration': 216, 'timestamp': 1538870400, 'upload_date': '20181007', @@ -37,20 +43,46 @@ def _real_extract(self, url): webpage = self._download_webpage( url, display_id, headers=self.geo_verification_headers()) - SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b' + video = self._parse_json(self._search_regex( + r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', + webpage, 'video object'), display_id) + video_source = video['VideoSource'] video_id = self._search_regex( - SOURCE_RE, webpage, 'video id', group='id') + r'/([\da-z]+)/index\b', video_source, 'video id') - media = self._parse_html5_media_entries( - url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, - m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + formats = self._extract_m3u8_formats( + video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + subtitles = {} + for s in (video.get('Subtitles') or {}): + s_url = s.get('Url') + if not s_url: + continue + subtitles.setdefault(s.get('Language') or 'cz', []).append({ + 'url': s_url, + }) + + entity_counts = video.get('EntityCounts') or {} + + def get_count(k): + v = entity_counts.get(k + 's') or {} + return int_or_none(dict_get(v, ('Count', 'StrCount'))) info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(media, info, { + return merge_dicts({ 'id': video_id, 'display_id': display_id, - 'title': self._og_search_title(webpage, default=None) or display_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - }) + 'title': video.get('Title'), + 'description': clean_html(video.get('Description')), + 'thumbnail': video.get('ThumbnailUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')), + 'view_count': get_count('View'), + 'like_count': get_count('Like'), + 'dislike_count': get_count('Dislike'), + 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])), + 'comment_count': get_count('Comment'), + }, info) diff --git a/youtube_dlc/extractor/medaltv.py b/youtube_dlc/extractor/medaltv.py new file mode 100644 index 000000000..1603b55f6 --- /dev/null +++ b/youtube_dlc/extractor/medaltv.py @@ -0,0 +1,131 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + try_get, +) + + +class MedalTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', + 'info_dict': { + 'id': '34934644', + 'ext': 'mp4', + 'title': 'Quad Cold', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'MowgliSB', + 'timestamp': 1603165266, + 'upload_date': '20201020', + 'uploader_id': 10619174, + } + }, { + 'url': 'https://medal.tv/clips/36787208', + 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', + 'info_dict': { + 'id': '36787208', + 'ext': 'mp4', + 'title': 'u tk me i tk u bigger', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'Mimicc', + 'timestamp': 1605580939, + 'upload_date': '20201117', + 'uploader_id': 5156321, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + hydration_data = self._parse_json(self._search_regex( + r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', + webpage, 'hydration data', default='{}'), video_id) + + clip = try_get( + hydration_data, lambda x: x['clips'][video_id], dict) or {} + if not clip: + raise ExtractorError( + 'Could not find video information.', video_id=video_id) + + title = clip['contentTitle'] + + source_width = int_or_none(clip.get('sourceWidth')) + source_height = int_or_none(clip.get('sourceHeight')) + + aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 + + def add_item(container, item_url, height, id_key='format_id', item_id=None): + item_id = item_id or '%dp' % height + if item_id not in item_url: + return + width = int(round(aspect_ratio * height)) + container.append({ + 'url': item_url, + id_key: item_id, + 'width': width, + 'height': height + }) + + formats = [] + thumbnails = [] + for k, v in clip.items(): + if not (v and isinstance(v, compat_str)): + continue + mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) + if not mobj: + continue + prefix = mobj.group(1) + height = int_or_none(mobj.group(2)) + if prefix == 'contentUrl': + add_item( + formats, v, height or source_height, + item_id=None if height else 'source') + elif prefix == 'thumbnail': + add_item(thumbnails, v, height, 'id') + + error = clip.get('error') + if not formats and error: + if error == 404: + raise ExtractorError( + 'That clip does not exist.', + expected=True, video_id=video_id) + else: + raise ExtractorError( + 'An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + self._sort_formats(formats) + + # Necessary because the id of the author is not known in advance. + # Won't raise an issue if no profile can be found as this is optional. + author = try_get( + hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} + author_id = str_or_none(author.get('id')) + author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clip.get('contentDescription'), + 'uploader': author.get('displayName'), + 'timestamp': float_or_none(clip.get('created'), 1000), + 'uploader_id': author_id, + 'uploader_url': author_url, + 'duration': int_or_none(clip.get('videoLengthSeconds')), + 'view_count': int_or_none(clip.get('views')), + 'like_count': int_or_none(clip.get('likes')), + 'comment_count': int_or_none(clip.get('comments')), + } diff --git a/youtube_dlc/extractor/mgtv.py b/youtube_dlc/extractor/mgtv.py index 71fc3ec56..cab3aa045 100644 --- a/youtube_dlc/extractor/mgtv.py +++ b/youtube_dlc/extractor/mgtv.py @@ -17,9 +17,8 @@ class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' - _GEO_COUNTRIES = ['CN'] _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor): }, { 'url': 'http://www.mgtv.com/b/301817/3826653.html', 'only_matching': True, + }, { + 'url': 'https://w.mgtv.com/b/301817/3826653.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ - 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: @@ -56,6 +59,7 @@ def _real_extract(self, url): stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ 'pm2': api_data['atc']['pm2'], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] stream_domain = stream_data['stream_domain'][0] diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index 6b3658397..d31f53137 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -289,7 +289,7 @@ def _extract_new_triforce_mgid(self, webpage, url='', video_id=None): return mgid - def _extract_mgid(self, webpage, url, data_zone=None): + def _extract_mgid(self, webpage, url, title=None, data_zone=None): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -300,7 +300,8 @@ def _extract_mgid(self, webpage, url, data_zone=None): except RegexNotFoundError: mgid = None - title = self._match_id(url) + if not title: + title = url_basename(url) try: window_data = self._parse_json(self._search_regex( @@ -336,7 +337,7 @@ def _extract_mgid(self, webpage, url, data_zone=None): def _real_extract(self, url): title = url_basename(url) webpage = self._download_webpage(url, title) - mgid = self._extract_mgid(webpage, url) + mgid = self._extract_mgid(webpage, url, title=title) videos_info = self._get_videos_info(mgid, url=url) return videos_info @@ -402,6 +403,18 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] + @staticmethod + def extract_child_with_type(parent, t): + children = parent['children'] + return next(c for c in children if c.get('type') == t) + + def _extract_mgid(self, webpage): + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self.extract_child_with_type(data, 'MainContainer') + video_player = self.extract_child_with_type(main_container, 'VideoPlayer') + return video_player['props']['media']['video']['config']['uri'] + class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' diff --git a/youtube_dlc/extractor/nbc.py b/youtube_dlc/extractor/nbc.py index 6f3cb3003..ea5f5a315 100644 --- a/youtube_dlc/extractor/nbc.py +++ b/youtube_dlc/extractor/nbc.py @@ -10,7 +10,6 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, - js_to_json, parse_duration, smuggle_url, try_get, @@ -394,8 +393,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.+});', webpage, - 'bootstrap json'), video_id, js_to_json) + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'bootstrap json'), video_id)['props']['initialState'] video_data = try_get(data, lambda x: x['video']['current'], dict) if not video_data: video_data = data['article']['content'][0]['primaryMedia']['video'] diff --git a/youtube_dlc/extractor/ndr.py b/youtube_dlc/extractor/ndr.py index f3897c71b..81abb3120 100644 --- a/youtube_dlc/extractor/ndr.py +++ b/youtube_dlc/extractor/ndr.py @@ -82,6 +82,29 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'uploader': 'ndrtv', + 'upload_date': '20201113', + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': r're:^https://www\.ndr\.de.+', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', 'only_matching': True, @@ -242,6 +265,20 @@ def _real_extract(self, url): 'preference': quality_key(thumbnail.get('quality')), }) + subtitles = {} + tracks = config.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_url = urljoin(url, track.get('src')) + if not track_url: + continue + subtitles.setdefault(track.get('srclang') or 'de', []).append({ + 'url': track_url, + 'ext': 'ttml', + }) + return { 'id': video_id, 'title': title, @@ -251,6 +288,7 @@ def _real_extract(self, url): 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dlc/extractor/netzkino.py b/youtube_dlc/extractor/netzkino.py index aec3026b1..3d1a06d0b 100644 --- a/youtube_dlc/extractor/netzkino.py +++ b/youtube_dlc/extractor/netzkino.py @@ -13,17 +13,16 @@ class NetzkinoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/[^/]+/(?P<id>[^/]+)' - _TEST = { - 'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond', + _TESTS = [{ + 'url': 'https://www.netzkino.de/#!/scifikino/rakete-zum-mond', 'md5': '92a3f8b76f8d7220acce5377ea5d4873', 'info_dict': { 'id': 'rakete-zum-mond', 'ext': 'mp4', - 'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)', - 'comments': 'mincount:3', - 'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28', + 'title': 'Rakete zum Mond \u2013 Jules Verne', + 'description': 'md5:f0a8024479618ddbfa450ff48ffa6c60', 'upload_date': '20120813', 'thumbnail': r're:https?://.*\.jpg$', 'timestamp': 1344858571, @@ -32,17 +31,30 @@ class NetzkinoIE(InfoExtractor): 'params': { 'skip_download': 'Download only works from Germany', } - } + }, { + 'url': 'https://www.netzkino.de/#!/filme/dr-jekyll-mrs-hyde-2', + 'md5': 'c7728b2dadd04ff6727814847a51ef03', + 'info_dict': { + 'id': 'dr-jekyll-mrs-hyde-2', + 'ext': 'mp4', + 'title': 'Dr. Jekyll & Mrs. Hyde 2', + 'description': 'md5:c2e9626ebd02de0a794b95407045d186', + 'upload_date': '20190130', + 'thumbnail': r're:https?://.*\.jpg$', + 'timestamp': 1548849437, + 'age_limit': 18, + }, + 'params': { + 'skip_download': 'Download only works from Germany', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - category_id = mobj.group('category') video_id = mobj.group('id') - api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id - api_info = self._download_json(api_url, video_id) - info = next( - p for p in api_info['posts'] if p['slug'] == video_id) + api_url = 'https://api.netzkino.de.simplecache.net/capi-2.0a/movies/%s.json?d=www' % video_id + info = self._download_json(api_url, video_id) custom_fields = info['custom_fields'] production_js = self._download_webpage( @@ -67,23 +79,12 @@ def _real_extract(self, url): } for key, tpl in templates.items()] self._sort_formats(formats) - comments = [{ - 'timestamp': parse_iso8601(c.get('date'), delimiter=' '), - 'id': c['id'], - 'author': c['name'], - 'html': c['content'], - 'parent': 'root' if c.get('parent', 0) == 0 else c['parent'], - } for c in info.get('comments', [])] - return { 'id': video_id, 'formats': formats, - 'comments': comments, 'title': info['title'], 'age_limit': int_or_none(custom_fields.get('FSK')[0]), 'timestamp': parse_iso8601(info.get('date'), delimiter=' '), 'description': clean_html(info.get('content')), 'thumbnail': info.get('thumbnail'), - 'playlist_title': api_info.get('title'), - 'playlist_id': category_id, } diff --git a/youtube_dlc/extractor/newgrounds.py b/youtube_dlc/extractor/newgrounds.py index 82e7cf522..b9f01235f 100644 --- a/youtube_dlc/extractor/newgrounds.py +++ b/youtube_dlc/extractor/newgrounds.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, extract_attributes, int_or_none, parse_duration, @@ -20,22 +21,22 @@ class NewgroundsIE(InfoExtractor): 'info_dict': { 'id': '549479', 'ext': 'mp3', - 'title': 'B7 - BusMode', + 'title': 'Burn7 - B7 - BusMode', 'uploader': 'Burn7', 'timestamp': 1378878540, 'upload_date': '20130911', 'duration': 143, }, }, { - 'url': 'https://www.newgrounds.com/portal/view/673111', - 'md5': '3394735822aab2478c31b1004fe5e5bc', + 'url': 'https://www.newgrounds.com/portal/view/1', + 'md5': 'fbfb40e2dc765a7e830cb251d370d981', 'info_dict': { - 'id': '673111', + 'id': '1', 'ext': 'mp4', - 'title': 'Dancin', - 'uploader': 'Squirrelman82', - 'timestamp': 1460256780, - 'upload_date': '20160410', + 'title': 'Brian-Beaton - Scrotum 1', + 'uploader': 'Brian-Beaton', + 'timestamp': 955064100, + 'upload_date': '20000406', }, }, { # source format unavailable, additional mp4 formats @@ -43,7 +44,7 @@ class NewgroundsIE(InfoExtractor): 'info_dict': { 'id': '689400', 'ext': 'mp4', - 'title': 'ZTV News Episode 8', + 'title': 'Bennettthesage - ZTV News Episode 8', 'uploader': 'BennettTheSage', 'timestamp': 1487965140, 'upload_date': '20170224', @@ -55,42 +56,73 @@ class NewgroundsIE(InfoExtractor): def _real_extract(self, url): media_id = self._match_id(url) - + formats = [] + uploader = None webpage = self._download_webpage(url, media_id) title = self._html_search_regex( r'<title>([^>]+)', webpage, 'title') - media_url = self._parse_json(self._search_regex( - r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id) + media_url_string = self._search_regex( + r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None, fatal=False) - formats = [{ - 'url': media_url, - 'format_id': 'source', - 'quality': 1, - }] + if media_url_string: + media_url = self._parse_json(media_url_string, media_id) + formats = [{ + 'url': media_url, + 'format_id': 'source', + 'quality': 1, + }] - max_resolution = int_or_none(self._search_regex( - r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution', - default=None)) - if max_resolution: - url_base = media_url.rpartition('.')[0] - for resolution in (360, 720, 1080): - if resolution > max_resolution: - break - formats.append({ - 'url': '%s.%dp.mp4' % (url_base, resolution), - 'format_id': '%dp' % resolution, - 'height': resolution, - }) + max_resolution = int_or_none(self._search_regex( + r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution', + default=None)) + if max_resolution: + url_base = media_url.rpartition('.')[0] + for resolution in (360, 720, 1080): + if resolution > max_resolution: + break + formats.append({ + 'url': '%s.%dp.mp4' % (url_base, resolution), + 'format_id': '%dp' % resolution, + 'height': resolution, + }) + else: + video_id = int_or_none(self._search_regex( + r'data-movie-id=\\"([0-9]+)\\"', webpage, '')) + if not video_id: + raise ExtractorError('Could not extract media data') + + url_video_data = 'https://www.newgrounds.com/portal/video/%s' % video_id + headers = { + 'Accept': 'application/json', + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest' + } + json_video = self._download_json(url_video_data, video_id, headers=headers, fatal=False) + if not json_video: + raise ExtractorError('Could not fetch media data') + + uploader = json_video.get('author') + title = json_video.get('title') + media_formats = json_video.get('sources', []) + for media_format in media_formats: + media_sources = media_formats[media_format] + for source in media_sources: + formats.append({ + 'format_id': media_format, + 'quality': int_or_none(media_format[:-1]), + 'url': source.get('src') + }) self._check_formats(formats, media_id) self._sort_formats(formats) - uploader = self._html_search_regex( - (r'(?s)]*>(.+?).*?\s*Author\s*', - r'(?:Author|Writer)\s*]+>([^<]+)'), webpage, 'uploader', - fatal=False) + if not uploader: + uploader = self._html_search_regex( + (r'(?s)]*>(.+?).*?\s*(?:Author|Artist)\s*', + r'(?:Author|Writer)\s*]+>([^<]+)'), webpage, 'uploader', + fatal=False) timestamp = unified_timestamp(self._html_search_regex( (r'
\s*Uploaded\s*
\s*
([^<]+
\s*
[^<]+)', @@ -109,6 +141,9 @@ def _real_extract(self, url): if '
Song' in webpage: formats[0]['vcodec'] = 'none' + if uploader: + title = "%s - %s" % (uploader, title) + return { 'id': media_id, 'title': title, diff --git a/youtube_dlc/extractor/nitter.py b/youtube_dlc/extractor/nitter.py new file mode 100644 index 000000000..3191543ed --- /dev/null +++ b/youtube_dlc/extractor/nitter.py @@ -0,0 +1,167 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + parse_count, + unified_strdate, + unified_timestamp, + remove_end, + determine_ext, +) +import re + + +class NitterIE(InfoExtractor): + # Taken from https://github.com/zedeus/nitter/wiki/Instances + INSTANCES = ('nitter.net', + 'nitter.snopyta.org', + 'nitter.42l.fr', + 'nitter.nixnet.services', + 'nitter.13ad.de', + 'nitter.pussthecat.org', + 'nitter.mastodont.cat', + 'nitter.dark.fail', + 'nitter.tedomum.net', + 'nitter.cattube.org', + 'nitter.fdn.fr', + 'nitter.1d4.us', + 'nitter.kavin.rocks', + 'tweet.lambda.dance', + 'nitter.cc', + 'nitter.weaponizedhumiliation.com', + '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion', + 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion', + 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion') + + _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')' + _VALID_URL = r'https?://%(instance)s/(?P.+)/status/(?P[0-9]+)(#.)?' % {'instance': _INSTANCES_RE} + current_instance = INSTANCES[0] # the test and official instance + _TESTS = [ + { + # GIF (wrapped in mp4) + 'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m', + 'info_dict': { + 'id': '1314279897502629888', + 'ext': 'mp4', + 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet', + 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Firefox 🔥', + 'uploader_id': 'firefox', + 'uploader_url': 'https://' + current_instance + '/firefox', + 'upload_date': '20201008', + 'timestamp': 1602183720, + }, + }, { # normal video + 'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m', + 'info_dict': { + 'id': '1299715685392756737', + 'ext': 'mp4', + 'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...', + 'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Le Doc', + 'uploader_id': 'Le___Doc', + 'uploader_url': 'https://' + current_instance + '/Le___Doc', + 'upload_date': '20200829', + 'timestamp': 1598711341, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + }, { # video embed in a "Streaming Political Ads" box + 'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m', + 'info_dict': { + 'id': '1321147074491092994', + 'ext': 'mp4', + 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds", + 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds", + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Mozilla', + 'uploader_id': 'mozilla', + 'uploader_url': 'https://' + current_instance + '/mozilla', + 'upload_date': '20201027', + 'timestamp': 1603820982 + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + parsed_url = compat_urlparse.urlparse(url) + base_url = parsed_url.scheme + '://' + parsed_url.netloc + + self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on') + webpage = self._download_webpage(url, video_id) + + video_url = base_url + self._html_search_regex(r'(?:]+data-url|]+src)="([^"]+)"', webpage, 'video url') + ext = determine_ext(video_url) + + if ext == 'unknown_video': + formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4') + else: + formats = [{ + 'url': video_url, + 'ext': ext + }] + + title = ( + self._og_search_description(webpage).replace('\n', ' ') + or self._html_search_regex(r'
]+title="([^"]+)"', webpage, 'uploader name', fatal=False)) + + if uploader_id: + uploader_url = base_url + '/' + uploader_id + + uploader = self._html_search_regex(r']+title="([^"]+)"', webpage, 'uploader name', fatal=False) + + if uploader: + title = uploader + ' - ' + title + + view_count = parse_count(self._html_search_regex(r']+class="icon-play[^>]*>\s([^<]+)
', webpage, 'view count', fatal=False)) + like_count = parse_count(self._html_search_regex(r']+class="icon-heart[^>]*>\s([^<]+)', webpage, 'like count', fatal=False)) + repost_count = parse_count(self._html_search_regex(r']+class="icon-retweet[^>]*>\s([^<]+)', webpage, 'repost count', fatal=False)) + comment_count = parse_count(self._html_search_regex(r']+class="icon-comment[^>]*>\s([^<]+)', webpage, 'repost count', fatal=False)) + + thumbnail = base_url + (self._html_search_meta('og:image', webpage, 'thumbnail url') + or self._html_search_regex(r']+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)) + + thumbnail = remove_end(thumbnail, '%3Asmall') # if parsed with regex, it should contain this + + thumbnails = [] + thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig') + for id in thumbnail_ids: + thumbnails.append({ + 'id': id, + 'url': thumbnail + '%3A' + id, + }) + + date = self._html_search_regex(r']+class="tweet-date"[^>]*>]+title="([^"]+)"', webpage, 'upload date', fatal=False) + upload_date = unified_strdate(date) + timestamp = unified_timestamp(date) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': timestamp, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'view_count': view_count, + 'like_count': like_count, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'formats': formats, + 'thumbnails': thumbnails, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + } diff --git a/youtube_dlc/extractor/npr.py b/youtube_dlc/extractor/npr.py index 53acc6e57..9d1122f0c 100644 --- a/youtube_dlc/extractor/npr.py +++ b/youtube_dlc/extractor/npr.py @@ -33,7 +33,7 @@ class NprIE(InfoExtractor): }, }], }, { - # mutlimedia, not media title + # multimedia, not media title 'url': 'https://www.npr.org/2017/06/19/533198237/tigers-jaw-tiny-desk-concert', 'info_dict': { 'id': '533198237', diff --git a/youtube_dlc/extractor/nrk.py b/youtube_dlc/extractor/nrk.py index 84aacbcda..4a395546f 100644 --- a/youtube_dlc/extractor/nrk.py +++ b/youtube_dlc/extractor/nrk.py @@ -9,6 +9,7 @@ compat_urllib_parse_unquote, ) from ..utils import ( + determine_ext, ExtractorError, int_or_none, js_to_json, @@ -16,17 +17,269 @@ parse_age_limit, parse_duration, try_get, + url_or_none, ) class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] - _api_host = None + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': '706f34cdf1322577589e369e522b50ef', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'flv', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }] + + def _extract_from_playback(self, video_id): + manifest = self._download_json( + 'http://psapi.nrk.no/playback/manifest/%s' % video_id, + video_id, 'Downloading manifest JSON') + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + data = self._download_json( + 'http://psapi.nrk.no/playback/metadata/%s' % video_id, + video_id, 'Downloading metadata JSON') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } def _real_extract(self, url): video_id = self._match_id(url) + return self._extract_from_playback(video_id) + +class NRKTVIE(NRKBaseIE): + IE_DESC = 'NRK TV and NRK Radio' + _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' + _VALID_URL = r'''(?x) + https?:// + (?:tv|radio)\.nrk(?:super)?\.no/ + (?:serie(?:/[^/]+){1,2}|program)/ + (?![Ee]pisodes)%s + (?:/\d{2}-\d{2}-\d{4})? + (?:\#del=(?P\d+))? + ''' % _EPISODE_RE + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') + _TESTS = [{ + 'url': 'https://tv.nrk.no/program/MDDP12000117', + 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'info_dict': { + 'id': 'MDDP12000117AA', + 'ext': 'mp4', + 'title': 'Alarm Trolltunga', + 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', + 'duration': 2223, + 'age_limit': 6, + }, + }, { + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'md5': '9a167e54d04671eb6317a37b7bc8a280', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + 'skip': 'NoProgramRights', + }, { + 'url': 'https://tv.nrk.no/program/mdfp15000514', + 'info_dict': { + 'id': 'MDFP15000514CA', + 'ext': 'mp4', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', + 'duration': 4605, + 'series': 'Kunnskapskanalen', + 'episode': '24.05.2014', + }, + 'params': { + 'skip_download': True, + }, + }, { + # single playlist video + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Video is geo restricted'], + 'skip': 'particular part is not supported currently', + }, { + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'playlist': [{ + 'info_dict': { + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'duration': 772, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'info_dict': { + 'id': 'MSPO40010515BH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'duration': 6175, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, + }, + }], + 'info_dict': { + 'id': 'MSPO40010515', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + }, + 'expected_warnings': ['Video is geo restricted'], + }, { + 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', + 'info_dict': { + 'id': 'KMTE50001317AA', + 'ext': 'mp4', + 'title': 'Anno 13:30', + 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', + 'duration': 2340, + 'series': 'Anno', + 'episode': '13:30', + 'season_number': 3, + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', + 'info_dict': { + 'id': 'MUHH46000317AA', + 'ext': 'mp4', + 'title': 'Nytt på Nytt 27.01.2017', + 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', + 'duration': 1796, + 'series': 'Nytt på nytt', + 'episode': '27.01.2017', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, + }] + + _api_host = None + + def _extract_from_mediaelement(self, video_id): api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS for api_host in api_hosts: @@ -195,190 +448,9 @@ def video_id_and_title(idx): return self.playlist_result(entries, video_id, title, description) - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/PS\*| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P[^?#&]+) - ''' - _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'flv', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }] - - -class NRKTVIE(NRKBaseIE): - IE_DESC = 'NRK TV and NRK Radio' - _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P\d+))? - ''' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') - _TESTS = [{ - 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', - 'info_dict': { - 'id': 'MDDP12000117AA', - 'ext': 'mp4', - 'title': 'Alarm Trolltunga', - 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, - 'age_limit': 6, - }, - }, { - 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', - 'info_dict': { - 'id': 'MUHH48000314AA', - 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', - 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'duration': 1741, - 'series': '20 spørsmål', - 'episode': '23.05.2014', - }, - 'skip': 'NoProgramRights', - }, { - 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'info_dict': { - 'id': 'MDFP15000514CA', - 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', - 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605, - 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', - }, - 'params': { - 'skip_download': True, - }, - }, { - # single playlist video - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Video is geo restricted'], - 'skip': 'particular part is not supported currently', - }, { - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], - 'info_dict': { - 'id': 'MSPO40010515', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - }, - 'expected_warnings': ['Video is geo restricted'], - }, { - 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', - 'info_dict': { - 'id': 'KMTE50001317AA', - 'ext': 'mp4', - 'title': 'Anno 13:30', - 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', - 'duration': 2340, - 'series': 'Anno', - 'episode': '13:30', - 'season_number': 3, - 'episode_number': 13, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', - 'info_dict': { - 'id': 'MUHH46000317AA', - 'ext': 'mp4', - 'title': 'Nytt på Nytt 27.01.2017', - 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', - 'duration': 1796, - 'series': 'Nytt på nytt', - 'episode': '27.01.2017', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', - 'only_matching': True, - }] + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_from_mediaelement(video_id) class NRKTVEpisodeIE(InfoExtractor): diff --git a/youtube_dlc/extractor/nytimes.py b/youtube_dlc/extractor/nytimes.py index fc78ca56c..976b1c694 100644 --- a/youtube_dlc/extractor/nytimes.py +++ b/youtube_dlc/extractor/nytimes.py @@ -221,3 +221,41 @@ def _real_extract(self, url): r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), webpage, 'podcast data') return self._extract_podcast_from_json(podcast_data, page_id, webpage) + + +class NYTimesCookingIE(NYTimesBaseIE): + _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P\d+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', + 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'info_dict': { + 'id': '100000004756089', + 'ext': 'mov', + 'timestamp': 1479383008, + 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', + 'title': 'Cranberry Tart', + 'upload_date': '20161117', + 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + }, + }, { + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', + 'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'info_dict': { + 'id': '100000003951728', + 'ext': 'mov', + 'timestamp': 1445509539, + 'description': 'Turkey guide', + 'upload_date': '20151022', + 'title': 'Turkey', + } + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + video_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'video id') + + return self._extract_video_from_id(video_id) diff --git a/youtube_dlc/extractor/pbs.py b/youtube_dlc/extractor/pbs.py index 4dbe661be..d4baa16ee 100644 --- a/youtube_dlc/extractor/pbs.py +++ b/youtube_dlc/extractor/pbs.py @@ -477,7 +477,7 @@ def _extract_webpage(self, url): if media_id: return media_id, presumptive_id, upload_date, description - # Fronline video embedded via flp + # Frontline video embedded via flp video_id = self._search_regex( r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) if video_id: diff --git a/youtube_dlc/extractor/pinterest.py b/youtube_dlc/extractor/pinterest.py new file mode 100644 index 000000000..b249c9eda --- /dev/null +++ b/youtube_dlc/extractor/pinterest.py @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + + def _call_api(self, resource, video_id, options): + return self._download_json( + 'https://www.pinterest.com/resource/%sResource/get/' % resource, + video_id, 'Download %s JSON metadata' % resource, query={ + 'data': json.dumps({'options': options}) + })['resource_response'] + + def _extract_video(self, data, extract_formats=True): + video_id = data['id'] + + title = (data.get('title') or data.get('grid_title') or video_id).strip() + + formats = [] + duration = None + if extract_formats: + for format_id, format_dict in data['videos']['video_list'].items(): + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('url')) + if not format_url: + continue + duration = float_or_none(format_dict.get('duration'), scale=1000) + ext = determine_ext(format_url) + if 'hls' in format_id.lower() or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'width': int_or_none(format_dict.get('width')), + 'height': int_or_none(format_dict.get('height')), + 'duration': duration, + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + description = data.get('description') or data.get('description_html') or data.get('seo_description') + timestamp = unified_timestamp(data.get('created_at')) + + def _u(field): + return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) + + uploader = _u('full_name') + uploader_id = _u('id') + + repost_count = int_or_none(data.get('repin_count')) + comment_count = int_or_none(data.get('comment_count')) + categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) + tags = data.get('hashtags') + + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, + 'formats': formats, + 'extractor_key': PinterestIE.ie_key(), + } + + +class PinterestIE(PinterestBaseIE): + _VALID_URL = r'%s/pin/(?P\d+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.com/pin/664281013778109217/', + 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', + 'info_dict': { + 'id': '664281013778109217', + 'ext': 'mp4', + 'title': 'Origami', + 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'duration': 57.7, + 'timestamp': 1593073622, + 'upload_date': '20200625', + 'uploader': 'Love origami -I am Dafei', + 'uploader_id': '586523688879454212', + 'repost_count': 50, + 'comment_count': 0, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://co.pinterest.com/pin/824721750502199491/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api( + 'Pin', video_id, { + 'field_set_key': 'unauth_react_main_pin', + 'id': video_id, + })['data'] + return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): + _VALID_URL = r'%s/(?P[^/]+)/(?P[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', + 'info_dict': { + 'id': '585890301462791043', + 'title': 'cool diys', + }, + 'playlist_count': 8, + }, { + 'url': 'https://www.pinterest.ca/fudohub/videos/', + 'info_dict': { + 'id': '682858430939307450', + 'title': 'VIDEOS', + }, + 'playlist_mincount': 365, + 'skip': 'Test with extract_formats=False', + }] + + @classmethod + def suitable(cls, url): + return False if PinterestIE.suitable(url) else super( + PinterestCollectionIE, cls).suitable(url) + + def _real_extract(self, url): + username, slug = re.match(self._VALID_URL, url).groups() + board = self._call_api( + 'Board', slug, { + 'slug': slug, + 'username': username + })['data'] + board_id = board['id'] + options = { + 'board_id': board_id, + 'page_size': 250, + } + bookmark = None + entries = [] + while True: + if bookmark: + options['bookmarks'] = [bookmark] + board_feed = self._call_api('BoardFeed', board_id, options) + for item in (board_feed.get('data') or []): + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + bookmark = board_feed.get('bookmark') + if not bookmark: + break + return self.playlist_result( + entries, playlist_id=board_id, playlist_title=board.get('name')) diff --git a/youtube_dlc/extractor/rai.py b/youtube_dlc/extractor/rai.py index 51a310f5c..5eef7c633 100644 --- a/youtube_dlc/extractor/rai.py +++ b/youtube_dlc/extractor/rai.py @@ -16,6 +16,7 @@ GeoRestrictedError, int_or_none, parse_duration, + remove_start, strip_or_none, try_get, unified_strdate, @@ -30,7 +31,6 @@ class RaiBaseIE(InfoExtractor): _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _GEO_COUNTRIES = ['IT'] _GEO_BYPASS = False - _BASE_URL = 'https://www.raiplay.it' def _extract_relinker_info(self, relinker_url, video_id): if not re.match(r'https?://', relinker_url): @@ -68,7 +68,7 @@ def _extract_relinker_info(self, relinker_url, video_id): # This does not imply geo restriction (e.g. # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if media_url == 'http://download.rai.it/video_no_available.mp4': + if '/video_no_available.mp4' in media_url: continue ext = determine_ext(media_url) @@ -123,7 +123,7 @@ def _extract_subtitles(url, subtitle_url): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P(?Phttps?://(?:www\.)?raiplay\.it/.+?-)(?P%s)(?P\.(?:html|json)))' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P%s))\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -131,11 +131,13 @@ class RaiPlayIE(RaiBaseIE): 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014 ', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Rai Gulp', 'duration': 6160, + 'series': 'Report', + 'season': '2013/14', }, 'params': { 'skip_download': True, @@ -146,11 +148,10 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext') + base, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') + base + '.json', video_id, 'Downloading video JSON') title = media['name'] video = media['video'] @@ -159,34 +160,39 @@ def _real_extract(self, url): self._sort_formats(relinker_info['formats']) thumbnails = [] - if 'images' in media: - for _, value in media.get('images').items(): - if value: - thumbnails.append({ - 'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400')) - }) + for _, value in media.get('images', {}).items(): + if value: + thumbnails.append({ + 'url': urljoin(url, value), + }) - timestamp = unified_timestamp(try_get( - media, lambda x: x['availabilities'][0]['start'], compat_str)) + date_published = media.get('date_published') + time_published = media.get('time_published') + if date_published and time_published: + date_published += ' ' + time_published subtitles = self._extract_subtitles(url, video.get('subtitles')) + program_info = media.get('program_info') or {} + season = media.get('season') + info = { - 'id': video_id, + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, - 'alt_title': media.get('subtitle'), + 'alt_title': strip_or_none(media.get('subtitle')), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), + 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date_published), 'thumbnails': thumbnails, - 'series': try_get( - media, lambda x: x['isPartOf']['name'], compat_str), - 'season_number': int_or_none(try_get( - media, lambda x: x['isPartOf']['numeroStagioni'])), - 'season': media.get('stagione') or None, + 'series': program_info.get('name'), + 'season_number': int_or_none(season), + 'season': season if (season and not season.isdigit()) else None, + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), 'subtitles': subtitles, } @@ -194,9 +200,9 @@ def _real_extract(self, url): return info -class RaiPlayLiveIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P[^/?#&]+)' - _TEST = { +class RaiPlayLiveIE(RaiPlayIE): + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/dirette/(?P[^/?#&]+))' + _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', @@ -211,40 +217,11 @@ class RaiPlayLiveIE(RaiBaseIE): 'params': { 'skip_download': True, }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - media = self._download_json( - '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id), - display_id, 'Downloading channel JSON') - - title = media['name'] - video = media['video'] - video_id = media['id'].replace('ContentItem-', '') - - relinker_info = self._extract_relinker_info(video['content_url'], video_id) - self._sort_formats(relinker_info['formats']) - - info = { - 'id': video_id, - 'display_id': display_id, - 'title': self._live_title(title) if relinker_info.get( - 'is_live') else title, - 'alt_title': media.get('subtitle'), - 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), - 'duration': parse_duration(video.get('duration')), - } - - info.update(relinker_info) - return info + }] class RaiPlayPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+)' + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { @@ -256,29 +233,34 @@ class RaiPlayPlaylistIE(InfoExtractor): }] def _real_extract(self, url): - playlist_id = self._match_id(url) + base, playlist_id = re.match(self._VALID_URL, url).groups() - media = self._download_json( - '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), - playlist_id, 'Downloading program JSON') - - title = media['name'] - description = media['program_info']['description'] - - content_sets = [s['id'] for b in media['blocks'] for s in b['sets']] + program = self._download_json( + base + '.json', playlist_id, 'Downloading program JSON') entries = [] - for cs in content_sets: - medias = self._download_json( - '%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs), - cs, 'Downloading content set JSON') - for m in medias['items']: - video_url = urljoin(url, m['path_id']) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + s_id = s.get('id') + if not s_id: + continue + medias = self._download_json( + '%s/%s.json' % (base, s_id), s_id, + 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + entries, playlist_id, program.get('name'), + try_get(program, lambda x: x['program_info']['description'])) class RaiIE(RaiBaseIE): @@ -294,7 +276,8 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, 'upload_date': '20140612', - } + }, + 'skip': 'This content is available only in Italy', }, { # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -440,7 +423,7 @@ def _real_extract(self, url): except ExtractorError: pass - relinker_url = self._search_regex( + relinker_url = self._proto_relative_url(self._search_regex( r'''(?x) (?: var\s+videoURL| @@ -452,7 +435,7 @@ def _real_extract(self, url): //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 ''', - webpage, 'relinker URL', group='url') + webpage, 'relinker URL', group='url')) relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) diff --git a/youtube_dlc/extractor/rcs.py b/youtube_dlc/extractor/rcs.py new file mode 100644 index 000000000..830182c6d --- /dev/null +++ b/youtube_dlc/extractor/rcs.py @@ -0,0 +1,413 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, + js_to_json, + base_url, + url_basename, + urljoin, +) + + +class RCSBaseIE(InfoExtractor): + _ALL_REPLACE = { + 'media2vam.corriere.it.edgesuite.net': + 'media2vam-corriere-it.akamaized.net', + 'media.youreporter.it.edgesuite.net': + 'media-youreporter-it.akamaized.net', + 'corrierepmd.corriere.it.edgesuite.net': + 'corrierepmd-corriere-it.akamaized.net', + 'media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/': + 'video.corriere.it/vr360/videos/', + '.net//': '.net/', + } + _MP4_REPLACE = { + 'media2vam.corbologna.corriere.it.edgesuite.net': + 'media2vam-bologna-corriere-it.akamaized.net', + 'media2vam.corfiorentino.corriere.it.edgesuite.net': + 'media2vam-fiorentino-corriere-it.akamaized.net', + 'media2vam.cormezzogiorno.corriere.it.edgesuite.net': + 'media2vam-mezzogiorno-corriere-it.akamaized.net', + 'media2vam.corveneto.corriere.it.edgesuite.net': + 'media2vam-veneto-corriere-it.akamaized.net', + 'media2.oggi.it.edgesuite.net': + 'media2-oggi-it.akamaized.net', + 'media2.quimamme.it.edgesuite.net': + 'media2-quimamme-it.akamaized.net', + 'media2.amica.it.edgesuite.net': + 'media2-amica-it.akamaized.net', + 'media2.living.corriere.it.edgesuite.net': + 'media2-living-corriere-it.akamaized.net', + 'media2.style.corriere.it.edgesuite.net': + 'media2-style-corriere-it.akamaized.net', + 'media2.iodonna.it.edgesuite.net': + 'media2-iodonna-it.akamaized.net', + 'media2.leitv.it.edgesuite.net': + 'media2-leitv-it.akamaized.net', + } + _MIGRATION_MAP = { + 'videoamica-vh.akamaihd': 'amica', + 'media2-amica-it.akamaized': 'amica', + 'corrierevam-vh.akamaihd': 'corriere', + 'media2vam-corriere-it.akamaized': 'corriere', + 'cormezzogiorno-vh.akamaihd': 'corrieredelmezzogiorno', + 'media2vam-mezzogiorno-corriere-it.akamaized': 'corrieredelmezzogiorno', + 'corveneto-vh.akamaihd': 'corrieredelveneto', + 'media2vam-veneto-corriere-it.akamaized': 'corrieredelveneto', + 'corbologna-vh.akamaihd': 'corrieredibologna', + 'media2vam-bologna-corriere-it.akamaized': 'corrieredibologna', + 'corfiorentino-vh.akamaihd': 'corrierefiorentino', + 'media2vam-fiorentino-corriere-it.akamaized': 'corrierefiorentino', + 'corinnovazione-vh.akamaihd': 'corriereinnovazione', + 'media2-gazzanet-gazzetta-it.akamaized': 'gazzanet', + 'videogazzanet-vh.akamaihd': 'gazzanet', + 'videogazzaworld-vh.akamaihd': 'gazzaworld', + 'gazzettavam-vh.akamaihd': 'gazzetta', + 'media2vam-gazzetta-it.akamaized': 'gazzetta', + 'videoiodonna-vh.akamaihd': 'iodonna', + 'media2-leitv-it.akamaized': 'leitv', + 'videoleitv-vh.akamaihd': 'leitv', + 'videoliving-vh.akamaihd': 'living', + 'media2-living-corriere-it.akamaized': 'living', + 'media2-oggi-it.akamaized': 'oggi', + 'videooggi-vh.akamaihd': 'oggi', + 'media2-quimamme-it.akamaized': 'quimamme', + 'quimamme-vh.akamaihd': 'quimamme', + 'videorunning-vh.akamaihd': 'running', + 'media2-style-corriere-it.akamaized': 'style', + 'style-vh.akamaihd': 'style', + 'videostyle-vh.akamaihd': 'style', + 'media2-stylepiccoli-it.akamaized': 'stylepiccoli', + 'stylepiccoli-vh.akamaihd': 'stylepiccoli', + 'doveviaggi-vh.akamaihd': 'viaggi', + 'media2-doveviaggi-it.akamaized': 'viaggi', + 'media2-vivimilano-corriere-it.akamaized': 'vivimilano', + 'vivimilano-vh.akamaihd': 'vivimilano', + 'media2-youreporter-it.akamaized': 'youreporter' + } + _MIGRATION_MEDIA = { + 'advrcs-vh.akamaihd': '', + 'corriere-f.akamaihd': '', + 'corrierepmd-corriere-it.akamaized': '', + 'corrprotetto-vh.akamaihd': '', + 'gazzetta-f.akamaihd': '', + 'gazzettapmd-gazzetta-it.akamaized': '', + 'gazzprotetto-vh.akamaihd': '', + 'periodici-f.akamaihd': '', + 'periodicisecure-vh.akamaihd': '', + 'videocoracademy-vh.akamaihd': '' + } + + def _get_video_src(self, video): + mediaFiles = video.get('mediaProfile').get('mediaFile') + src = {} + # audio + if video.get('mediaType') == 'AUDIO': + for aud in mediaFiles: + # todo: check + src['mp3'] = aud.get('value') + # video + else: + for vid in mediaFiles: + if vid.get('mimeType') == 'application/vnd.apple.mpegurl': + src['m3u8'] = vid.get('value') + if vid.get('mimeType') == 'video/mp4': + src['mp4'] = vid.get('value') + + # replace host + for t in src: + for s, r in self._ALL_REPLACE.items(): + src[t] = src[t].replace(s, r) + for s, r in self._MP4_REPLACE.items(): + src[t] = src[t].replace(s, r) + + # switch cdn + if 'mp4' in src and 'm3u8' in src: + if ('-lh.akamaihd' not in src.get('m3u8') + and 'akamai' in src.get('mp4')): + if 'm3u8' in src: + matches = re.search(r'(?:https*:)?\/\/(?P.*)\.net\/i(?P.*)$', src.get('m3u8')) + src['m3u8'] = 'https://vod.rcsobjects.it/hls/%s%s' % ( + self._MIGRATION_MAP[matches.group('host')], + matches.group('path').replace( + '///', '/').replace( + '//', '/').replace( + '.csmil', '.urlset' + ) + ) + if 'mp4' in src: + matches = re.search(r'(?:https*:)?\/\/(?P.*)\.net\/i(?P.*)$', src.get('mp4')) + if matches: + if matches.group('host') in self._MIGRATION_MEDIA: + vh_stream = 'https://media2.corriereobjects.it' + if src.get('mp4').find('fcs.quotidiani_!'): + vh_stream = 'https://media2-it.corriereobjects.it' + src['mp4'] = '%s%s' % ( + vh_stream, + matches.group('path').replace( + '///', '/').replace( + '//', '/').replace( + '/fcs.quotidiani/mediacenter', '').replace( + '/fcs.quotidiani_!/mediacenter', '').replace( + 'corriere/content/mediacenter/', '').replace( + 'gazzetta/content/mediacenter/', '') + ) + else: + src['mp4'] = 'https://vod.rcsobjects.it/%s%s' % ( + self._MIGRATION_MAP[matches.group('host')], + matches.group('path').replace('///', '/').replace('//', '/') + ) + + if 'mp3' in src: + src['mp3'] = src.get('mp3').replace( + 'media2vam-corriere-it.akamaized.net', + 'vod.rcsobjects.it/corriere') + if 'mp4' in src: + if src.get('mp4').find('fcs.quotidiani_!'): + src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects') + if 'm3u8' in src: + if src.get('m3u8').find('fcs.quotidiani_!'): + src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects') + + if 'geoblocking' in video.get('mediaProfile'): + if 'm3u8' in src: + src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects') + if 'mp4' in src: + src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects') + if 'm3u8' in src: + if src.get('m3u8').find('csmil') and src.get('m3u8').find('vod'): + src['m3u8'] = src.get('m3u8').replace('.csmil', '.urlset') + + return src + + def _create_formats(self, urls, video_id): + formats = [] + formats = self._extract_m3u8_formats( + urls.get('m3u8'), video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + + if not formats: + formats.append({ + 'format_id': 'http-mp4', + 'url': urls.get('mp4') + }) + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + video_id = self._match_id(url) + mobj = re.search(self._VALID_URL, url) + + if 'cdn' not in mobj.groupdict(): + raise ExtractorError('CDN not found in url: %s' % url) + + # for leitv/youreporter/viaggi don't use the embed page + if ((mobj.group('cdn') not in ['leitv.it', 'youreporter.it']) + and (mobj.group('vid') == 'video')): + url = 'https://video.%s/video-embed/%s' % (mobj.group('cdn'), video_id) + + page = self._download_webpage(url, video_id) + + video_data = None + # look for json video data url + json = self._search_regex( + r'''(?x)var url\s*=\s*["']((?:https?:)? + //video\.rcs\.it + /fragment-includes/video-includes/.+?\.json)["'];''', + page, video_id, default=None) + if json: + if json.startswith('//'): + json = 'https:%s' % json + video_data = self._download_json(json, video_id) + + # if json url not found, look for json video data directly in the page + else: + json = self._search_regex( + r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)', + page, video_id, default=None) + if json: + video_data = self._parse_json( + json, video_id, transform_source=js_to_json) + else: + # if no video data found try search for iframes + emb = RCSEmbedsIE._extract_url(page) + if emb: + return { + '_type': 'url_transparent', + 'url': emb, + 'ie_key': RCSEmbedsIE.ie_key() + } + + if not video_data: + raise ExtractorError('Video data not found in the page') + + formats = self._create_formats( + self._get_video_src(video_data), video_id) + + description = (video_data.get('description') + or clean_html(video_data.get('htmlDescription'))) + uploader = video_data.get('provider') or mobj.group('cdn') + + return { + 'id': video_id, + 'title': video_data.get('title'), + 'description': description, + 'uploader': uploader, + 'formats': formats + } + + +class RCSEmbedsIE(RCSBaseIE): + _VALID_URL = r'''(?x) + https?://(?Pvideo)\. + (?P + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + )\.it) + /video-embed/(?P[^/=&\?]+?)(?:$|\?)''' + _TESTS = [{ + 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037', + 'md5': '623ecc8ffe7299b2d0c1046d8331a9df', + 'info_dict': { + 'id': 'iodonna-0001585037', + 'ext': 'mp4', + 'title': 'Sky Arte racconta Madonna nella serie "Artist to icon"', + 'description': 'md5:65b09633df9ffee57f48b39e34c9e067', + 'uploader': 'rcs.it', + } + }, { + 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', + 'md5': 'a043e3fecbe4d9ed7fc5d888652a5440', + 'info_dict': { + 'id': 'gazzanet-mo05-0000260789', + 'ext': 'mp4', + 'title': 'Valentino Rossi e papà Graziano si divertono col drifting', + 'description': 'md5:a8bf90d6adafd9815f70fc74c0fc370a', + 'uploader': 'rcd', + } + }, { + 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player', + 'match_only': True + }, { + 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140', + 'match_only': True + }] + + @staticmethod + def _sanitize_urls(urls): + # add protocol if missing + for i, e in enumerate(urls): + if e.startswith('//'): + urls[i] = 'https:%s' % e + # clean iframes urls + for i, e in enumerate(urls): + urls[i] = urljoin(base_url(e), url_basename(e)) + return urls + + @staticmethod + def _extract_urls(webpage): + entries = [ + mobj.group('url') + for mobj in re.finditer(r'''(?x) + (?: + data-frame-src=| + (?:https?:)?//video\. + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + ) + \.it/video-embed/.+?) + \1''', webpage)] + return RCSEmbedsIE._sanitize_urls(entries) + + @staticmethod + def _extract_url(webpage): + urls = RCSEmbedsIE._extract_urls(webpage) + return urls[0] if urls else None + + +class RCSIE(RCSBaseIE): + _VALID_URL = r'''(?x)https?://(?Pvideo|viaggi)\. + (?P + (?: + corrieredelmezzogiorno\. + |corrieredelveneto\. + |corrieredibologna\. + |corrierefiorentino\. + )?corriere\.it + |(?:gazzanet\.)?gazzetta\.it) + /(?!video-embed/).+?/(?P[^/\?]+)(?=\?|/$|$)''' + _TESTS = [{ + 'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb', + 'md5': '0f4ededc202b0f00b6e509d831e2dcda', + 'info_dict': { + 'id': 'b727632a-f9d0-11ea-91b0-38d50a849abb', + 'ext': 'mp4', + 'title': 'Vettel guida la Ferrari SF90 al Mugello e al suo fianco c\'è Leclerc (bendato): il video è esilarante', + 'description': 'md5:93b51c9161ac8a64fb2f997b054d0152', + 'uploader': 'Corriere Tv', + } + }, { + 'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/', + 'md5': 'da378e4918d2afbf7d61c35abb948d4c', + 'info_dict': { + 'id': '5b7cd134-e2c1-11ea-89b3-b56dd0df2aa2', + 'ext': 'mp4', + 'title': 'La nuova spettacolare attrazione in Norvegia: il ponte sopra Vøringsfossen', + 'description': 'md5:18b35a291f6746c0c8dacd16e5f5f4f8', + 'uploader': 'DOVE Viaggi', + } + }, { + 'url': 'https://video.gazzetta.it/video-motogp-catalogna-cadute-dovizioso-vale-rossi/49612410-00ca-11eb-bcd8-30d4253e0140?vclk=Videobar', + 'md5': 'eedc1b5defd18e67383afef51ff7bdf9', + 'info_dict': { + 'id': '49612410-00ca-11eb-bcd8-30d4253e0140', + 'ext': 'mp4', + 'title': 'Dovizioso, il contatto con Zarco e la caduta. E anche Vale finisce a terra', + 'description': 'md5:8c6e905dc3b9413218beca11ebd69778', + 'uploader': 'AMorici', + } + }, { + 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945', + 'match_only': True + }] + + +class RCSVariousIE(RCSBaseIE): + _VALID_URL = r'''(?x)https?://www\. + (?P + leitv\.it| + youreporter\.it + )/(?:video/)?(?P[^/]+?)(?:$|\?|/)''' + _TESTS = [{ + 'url': 'https://www.leitv.it/video/marmellata-di-ciliegie-fatta-in-casa/', + 'md5': '618aaabac32152199c1af86784d4d554', + 'info_dict': { + 'id': 'marmellata-di-ciliegie-fatta-in-casa', + 'ext': 'mp4', + 'title': 'Marmellata di ciliegie fatta in casa', + 'description': 'md5:89133864d6aad456dbcf6e7a29f86263', + 'uploader': 'leitv.it', + } + }, { + 'url': 'https://www.youreporter.it/fiume-sesia-3-ottobre-2020/', + 'md5': '8dccd436b47a830bab5b4a88232f391a', + 'info_dict': { + 'id': 'fiume-sesia-3-ottobre-2020', + 'ext': 'mp4', + 'title': 'Fiume Sesia 3 ottobre 2020', + 'description': 'md5:0070eef1cc884d13c970a4125063de55', + 'uploader': 'youreporter.it', + } + }] diff --git a/youtube_dlc/extractor/rumble.py b/youtube_dlc/extractor/rumble.py new file mode 100644 index 000000000..4a0225109 --- /dev/null +++ b/youtube_dlc/extractor/rumble.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + try_get, +) + + +class RumbleEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P[0-9a-z]+)' + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + 'https://rumble.com/embedJS/', video_id, + query={'request': 'video', 'v': video_id}) + title = video['title'] + + formats = [] + for height, ua in (video.get('ua') or {}).items(): + for i in range(2): + f_url = try_get(ua, lambda x: x[i], compat_str) + if f_url: + ext = determine_ext(f_url) + f = { + 'ext': ext, + 'format_id': '%s-%sp' % (ext, height), + 'height': int_or_none(height), + 'url': f_url, + } + bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) + if bitrate: + f['tbr'] = int_or_none(bitrate) + formats.append(f) + self._sort_formats(formats) + + author = video.get('author') or {} + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('i'), + 'timestamp': parse_iso8601(video.get('pubDate')), + 'channel': author.get('name'), + 'channel_url': author.get('url'), + 'duration': int_or_none(video.get('duration')), + } diff --git a/youtube_dlc/extractor/servus.py b/youtube_dlc/extractor/servus.py index 9401bf2cf..1610ddc2c 100644 --- a/youtube_dlc/extractor/servus.py +++ b/youtube_dlc/extractor/servus.py @@ -1,9 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + unified_timestamp, + urlencode_postdata, + url_or_none, +) class ServusIE(InfoExtractor): @@ -12,20 +18,29 @@ class ServusIE(InfoExtractor): (?:www\.)? (?: servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - servustv\.com/videos + (?:servustv|pm-wissen)\.com/videos ) /(?P[aA]{2}-\w+|\d+-\d+) ''' _TESTS = [{ # new URL schema 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', - 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'md5': '60474d4c21f3eb148838f215c37f02b9', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', 'title': 'Die Grünen aus Sicht des Volkes', + 'alt_title': 'Talk im Hangar-7 Voxpops Gruene', 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 62.442, + 'timestamp': 1605193976, + 'upload_date': '20201112', + 'series': 'Talk im Hangar-7', + 'season': 'Season 9', + 'season_number': 9, + 'episode': 'Episode 31 - September 14', + 'episode_number': 31, } }, { # old URL schema @@ -40,30 +55,94 @@ class ServusIE(InfoExtractor): }, { 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', 'only_matching': True, + }, { + 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).upper() - webpage = self._download_webpage(url, video_id) - title = self._search_regex( - (r'videoLabel\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), - webpage, 'title', default=None, - group='title') or self._og_search_title(webpage) - title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + token = self._download_json( + 'https://auth.redbullmediahouse.com/token', video_id, + 'Downloading token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + }), headers={ + 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', + }) + access_token = token['access_token'] + token_type = token.get('token_type', 'Bearer') - formats = self._extract_m3u8_formats( - 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + video = self._download_json( + 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, + video_id, 'Downloading video JSON', headers={ + 'Authorization': '%s %s' % (token_type, access_token), + }) + + formats = [] + thumbnail = None + for resource in video['resources']: + if not isinstance(resource, dict): + continue + format_url = url_or_none(resource.get('url')) + if not format_url: + continue + extension = resource.get('extension') + type_ = resource.get('type') + if extension == 'jpg' or type_ == 'reference_keyframe': + thumbnail = format_url + continue + ext = determine_ext(format_url) + if type_ == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif type_ == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif extension == 'mp4' or ext == 'mp4': + formats.append({ + 'url': format_url, + 'format_id': type_, + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + }) self._sort_formats(formats) + attrs = {} + for attribute in video['attributes']: + if not isinstance(attribute, dict): + continue + key = attribute.get('fieldKey') + value = attribute.get('fieldValue') + if not key or not value: + continue + attrs[key] = value + + title = attrs.get('title_stv') or video_id + alt_title = attrs.get('title') + description = attrs.get('long_description') or attrs.get('short_description') + series = attrs.get('label') + season = attrs.get('season') + episode = attrs.get('chapter') + duration = float_or_none(attrs.get('duration'), scale=1000) + season_number = int_or_none(self._search_regex( + r'Season (\d+)', season or '', 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', episode or '', 'episode number', default=None)) + return { 'id': video_id, 'title': title, + 'alt_title': alt_title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': unified_timestamp(video.get('lastPublished')), + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'formats': formats, } diff --git a/youtube_dlc/extractor/skyitalia.py b/youtube_dlc/extractor/skyitalia.py new file mode 100644 index 000000000..22a6be2be --- /dev/null +++ b/youtube_dlc/extractor/skyitalia.py @@ -0,0 +1,123 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SkyItaliaBaseIE(InfoExtractor): + _GET_VIDEO_DATA = 'https://apid.sky.it/vdp/v1/getVideoData?token={token}&caller=sky&rendition=web&id={id}' + _RES = { + 'low': [426, 240], + 'med': [640, 360], + 'high': [854, 480], + 'hd': [1280, 720] + } + _GEO_BYPASS = False + + def _extract_video_id(self, url): + webpage = self._download_webpage(url, 'skyitalia') + video_id = self._html_search_regex( + [r'data-videoid=\"(\d+)\"', + r'http://player\.sky\.it/social\?id=(\d+)\&'], + webpage, 'video_id') + if video_id: + return video_id + raise ExtractorError('Video ID not found.') + + def _get_formats(self, video_id, token): + data_url = self._GET_VIDEO_DATA.replace('{id}', video_id) + data_url = data_url.replace('{token}', token) + video_data = self._parse_json( + self._download_webpage(data_url, video_id), + video_id) + + formats = [] + for q, r in self._RES.items(): + key = 'web_%s_url' % q + if key not in video_data: + continue + formats.append({ + 'url': video_data.get(key), + 'format_id': q, + 'width': r[0], + 'height': r[1] + }) + + if not formats and video_data.get('geob') == 1: + self.raise_geo_restricted(countries=['IT']) + + self._sort_formats(formats) + title = video_data.get('title') + thumb = video_data.get('thumb') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumb, + 'formats': formats + } + + def _real_extract(self, url): + video_id = self._match_id(url) + if video_id == 'None': + video_id = self._extract_video_id(url) + return self._get_formats(video_id, self._TOKEN) + + +class SkyItaliaIE(SkyItaliaBaseIE): + IE_NAME = 'sky.it' + _VALID_URL = r'''(?x)https?:// + (?P<ie>sport|tg24|video) + \.sky\.it/(?:.+?) + (?P<id>[0-9]{6})? + (?:$|\?)''' + + _TESTS = [{ + 'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162', + 'md5': '9c03b590b06e5952d8051f0e02b0feca', + 'info_dict': { + 'id': '616162', + 'ext': 'mp4', + 'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg', + } + }, { + 'url': 'https://sport.sky.it/motogp/2020/09/18/motogp-gp-emilia-romagna-misano-2020-prove-libere-diretta', + 'md5': '9c03b590b06e5952d8051f0e02b0feca', + 'info_dict': { + 'id': '616162', + 'ext': 'mp4', + 'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg', + } + }, { + 'url': 'https://tg24.sky.it/salute-e-benessere/2020/09/18/coronavirus-vaccino-ue-sanofi', + 'md5': 'caa25e62dadb529bc5e0b078da99f854', + 'info_dict': { + 'id': '615904', + 'ext': 'mp4', + 'title': 'Covid-19, al Buzzi di Milano tamponi drive-in per studenti', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/17/1600351405841_error-coronavirus-al-buzzi-di-milano-tamponi_thumbnail_1.jpg', + } + }, { + 'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162?itm_source=parsely-api', + 'only_matching': True, + }] + _TOKEN = 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk' + + +class SkyArteItaliaIE(SkyItaliaBaseIE): + IE_NAME = 'arte.sky.it' + _VALID_URL = r'https?://arte\.sky\.it/video/.+?(?P<id>[0-9]{6})?$' + _TEST = { + 'url': 'https://arte.sky.it/video/federico-fellini-maestri-cinema/', + 'md5': '2f22513a89f45142f2746f878d690647', + 'info_dict': { + 'id': '612888', + 'ext': 'mp4', + 'title': 'I maestri del cinema Federico Felini', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/03/1599146747305_i-maestri-del-cinema-federico-felini_thumbnail_1.jpg', + } + } + _TOKEN = 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd' diff --git a/youtube_dlc/extractor/soundcloud.py b/youtube_dlc/extractor/soundcloud.py index ed70b7169..47f68bf19 100644 --- a/youtube_dlc/extractor/soundcloud.py +++ b/youtube_dlc/extractor/soundcloud.py @@ -649,7 +649,7 @@ def _real_extract(self, url): class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): - # Per the SoundCloud documentation, the maximum limit for a linked partioning query is 200. + # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200. # https://developers.soundcloud.com/blog/offset-pagination-deprecated COMMON_QUERY = { 'limit': 200, diff --git a/youtube_dlc/extractor/southpark.py b/youtube_dlc/extractor/southpark.py index 20ae7c5e7..95e6d2890 100644 --- a/youtube_dlc/extractor/southpark.py +++ b/youtube_dlc/extractor/southpark.py @@ -44,7 +44,7 @@ class SouthParkEsIE(SouthParkIE): class SouthParkDeIE(SouthParkIE): IE_NAME = 'southpark.de' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:videoclip|collections|folgen)/(?P<id>(?P<unique_id>.+?)/.+?)(?:\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:(en/(videoclip|collections|episodes))|(videoclip|collections|folgen))/(?P<id>(?P<unique_id>.+?)/.+?)(?:\?|#|$))' # _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ diff --git a/youtube_dlc/extractor/spiegel.py b/youtube_dlc/extractor/spiegel.py index 4df7f4ddc..2da32b9b2 100644 --- a/youtube_dlc/extractor/spiegel.py +++ b/youtube_dlc/extractor/spiegel.py @@ -1,159 +1,54 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .spiegeltv import SpiegeltvIE -from ..compat import compat_urlparse -from ..utils import ( - parse_duration, - strip_or_none, - unified_timestamp, -) +from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': 'b57399839d055fccfeb9a0455c439868', + 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', 'info_dict': { - 'id': '563747', + 'id': 'II0BUyxY', + 'display_id': '1259285', 'ext': 'mp4', - 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', - 'duration': 49, + 'duration': 48.0, 'upload_date': '20130311', - 'timestamp': 1362994320, + 'timestamp': 1362997920, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', - 'info_dict': { - 'id': '580988', - 'ext': 'mp4', - 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', - 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', - 'duration': 983, - 'upload_date': '20131115', - 'timestamp': 1384546642, - }, - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': '97b91083a672d72976faa8433430afb9', - 'info_dict': { - 'id': '601883', - 'ext': 'mp4', - 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', - 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', - 'upload_date': '20140904', - 'timestamp': 1409834160, - } - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, }, { - # nexx video + 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', + 'only_matching': True, + }, { + 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7', + 'only_matching': True, + }, { 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id - handle = self._request_webpage(metadata_url, video_id) - - # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html - if SpiegeltvIE.suitable(handle.geturl()): - return self.url_result(handle.geturl(), 'Spiegeltv') - - video_data = self._parse_json(self._webpage_read_content( - handle, metadata_url, video_id), video_id) - title = video_data['title'] - nexx_id = video_data['nexxOmniaId'] - domain_id = video_data.get('nexxOmniaDomain') or '748' - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'nexx:%s:%s' % (domain_id, nexx_id), - 'title': title, - 'description': strip_or_none(video_data.get('teaser')), - 'duration': parse_duration(video_data.get('duration')), - 'timestamp': unified_timestamp(video_data.get('datum')), - 'ie_key': NexxIE.ie_key(), - } - - -class SpiegelArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' - IE_NAME = 'Spiegel:Article' - IE_DESC = 'Articles on spiegel.de' - _TESTS = [{ + }, { 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', - 'info_dict': { - 'id': '1516455', - 'ext': 'mp4', - 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', - 'description': 're:^Patrick Kämnitz gehört.{100,}', - 'upload_date': '20140825', - }, - }, { - 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', - 'info_dict': { - - }, - 'playlist_count': 6, - }, { - # Nexx iFrame embed - 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', - 'info_dict': { - 'id': '161464', - 'ext': 'mp4', - 'title': 'Nervenkitzel Achterbahn', - 'alt_title': 'Karussellbauer in Deutschland', - 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2761, - 'timestamp': 1394021479, - 'upload_date': '20140305', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - # Single video on top of the page - video_link = self._search_regex( - r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, - 'video page URL', default=None) - if video_link: - video_url = compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', video_link) - return self.url_result(video_url) - - # Multiple embedded videos - embeds = re.findall( - r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', - webpage) - entries = [ - self.url_result(compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds] - if embeds: - return self.playlist_result(entries) - - return self.playlist_from_matches( - NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) + media_id = self._html_search_regex( + r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2', + webpage, 'media id', group='id') + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': video_id, + 'url': 'jwplatform:%s' % media_id, + 'title': self._og_search_title(webpage, default=None), + 'ie_key': JWPlatformIE.ie_key(), + } diff --git a/youtube_dlc/extractor/spreaker.py b/youtube_dlc/extractor/spreaker.py new file mode 100644 index 000000000..6c7e40ae4 --- /dev/null +++ b/youtube_dlc/extractor/spreaker.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +def _extract_episode(data, episode_id=None): + title = data['title'] + download_url = data['download_url'] + + series = try_get(data, lambda x: x['show']['title'], compat_str) + uploader = try_get(data, lambda x: x['author']['fullname'], compat_str) + + thumbnails = [] + for image in ('image_original', 'image_medium', 'image'): + image_url = url_or_none(data.get('%s_url' % image)) + if image_url: + thumbnails.append({'url': image_url}) + + def stats(key): + return int_or_none(try_get( + data, + (lambda x: x['%ss_count' % key], + lambda x: x['stats']['%ss' % key]))) + + def duration(key): + return float_or_none(data.get(key), scale=1000) + + return { + 'id': compat_str(episode_id or data['episode_id']), + 'url': download_url, + 'display_id': data.get('permalink'), + 'title': title, + 'description': data.get('description'), + 'timestamp': unified_timestamp(data.get('published_at')), + 'uploader': uploader, + 'uploader_id': str_or_none(data.get('author_id')), + 'creator': uploader, + 'duration': duration('duration') or duration('length'), + 'view_count': stats('play'), + 'like_count': stats('like'), + 'comment_count': stats('message'), + 'format': 'MPEG Layer 3', + 'format_id': 'mp3', + 'container': 'mp3', + 'ext': 'mp3', + 'thumbnails': thumbnails, + 'series': series, + 'extractor_key': SpreakerIE.ie_key(), + } + + +class SpreakerIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + api\.spreaker\.com/ + (?: + (?:download/)?episode| + v2/episodes + )/ + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://api.spreaker.com/episode/12534508', + 'info_dict': { + 'id': '12534508', + 'display_id': 'swm-ep15-how-to-market-your-music-part-2', + 'ext': 'mp3', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', + 'description': 'md5:0588c43e27be46423e183076fa071177', + 'timestamp': 1502250336, + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': '9780658', + 'duration': 1063.42, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'series': 'Success With Music (SWM)', + }, + }, { + 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', + 'only_matching': True, + }, { + 'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + data = self._download_json( + 'https://api.spreaker.com/v2/episodes/%s' % episode_id, + episode_id)['response']['episode'] + return _extract_episode(data, episode_id) + + +class SpreakerPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + episode_id = self._search_regex( + (r'data-episode_id=["\'](?P<id>\d+)', + r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id') + return self.url_result( + 'https://api.spreaker.com/episode/%s' % episode_id, + ie=SpreakerIE.ie_key(), video_id=episode_id) + + +class SpreakerShowIE(InfoExtractor): + _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://api.spreaker.com/show/4652058', + 'info_dict': { + 'id': '4652058', + }, + 'playlist_mincount': 118, + }] + + def _entries(self, show_id): + for page_num in itertools.count(1): + episodes = self._download_json( + 'https://api.spreaker.com/show/%s/episodes' % show_id, + show_id, note='Downloading JSON page %d' % page_num, query={ + 'page': page_num, + 'max_per_page': 100, + }) + pager = try_get(episodes, lambda x: x['response']['pager'], dict) + if not pager: + break + results = pager.get('results') + if not results or not isinstance(results, list): + break + for result in results: + if not isinstance(result, dict): + continue + yield _extract_episode(result) + if page_num == pager.get('last_page'): + break + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) + + +class SpreakerShowPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/success-with-music', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show_id = self._search_regex( + r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id') + return self.url_result( + 'https://api.spreaker.com/show/%s' % show_id, + ie=SpreakerShowIE.ie_key(), video_id=show_id) diff --git a/youtube_dlc/extractor/svt.py b/youtube_dlc/extractor/svt.py index 2f6887d86..a0b6ef4db 100644 --- a/youtube_dlc/extractor/svt.py +++ b/youtube_dlc/extractor/svt.py @@ -9,6 +9,7 @@ determine_ext, dict_get, int_or_none, + unified_timestamp, str_or_none, strip_or_none, try_get, @@ -44,7 +45,8 @@ def _extract_video(self, video_info, video_id): 'format_id': player_type, 'url': vurl, }) - if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + rights = try_get(video_info, lambda x: x['rights'], dict) or {} + if not formats and rights.get('geoBlockedSweden'): self.raise_geo_restricted( 'This video is only available in Sweden', countries=self._GEO_COUNTRIES) @@ -70,6 +72,7 @@ def _extract_video(self, video_info, video_id): episode = video_info.get('episodeTitle') episode_number = int_or_none(video_info.get('episodeNumber')) + timestamp = unified_timestamp(rights.get('validFrom')) duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) age_limit = None adult = dict_get( @@ -84,6 +87,7 @@ def _extract_video(self, video_info, video_id): 'formats': formats, 'subtitles': subtitles, 'duration': duration, + 'timestamp': timestamp, 'age_limit': age_limit, 'series': series, 'season_number': season_number, @@ -136,26 +140,39 @@ class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'''(?x) (?: - svt:(?P<svt_id>[^/?#&]+)| + (?: + svt:| + https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ + ) + (?P<svt_id>[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) ) ''' _TESTS = [{ - 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', - 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', + 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': '5996901', + 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Flygplan till Haile Selassie', - 'duration': 3527, - 'thumbnail': r're:^https?://.*[\.-]jpg$', + 'title': 'Det här är himlen', + 'timestamp': 1586044800, + 'upload_date': '20200405', + 'duration': 3515, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { 'sv': [{ - 'ext': 'wsrt', + 'ext': 'vtt', }] }, }, + 'params': { + 'format': 'bestvideo', + # skip for now due to download test asserts that segment is > 10000 bytes and svt uses + # init segments that are smaller + # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B + 'skip_download': True, + }, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -172,6 +189,12 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'svt:14278044', 'only_matching': True, + }, { + 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', + 'only_matching': True, + }, { + 'url': 'svt:eWv5MLX', + 'only_matching': True, }] def _adjust_title(self, info): @@ -236,7 +259,10 @@ def _real_extract(self, url): r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), webpage, 'video id') - return self._extract_by_video_id(svt_id, webpage) + info_dict = self._extract_by_video_id(svt_id, webpage) + info_dict['thumbnail'] = thumbnail + + return info_dict class SVTSeriesIE(SVTPlayBaseIE): @@ -360,7 +386,7 @@ class SVTPageIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): path, display_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dlc/extractor/tagesschau.py b/youtube_dlc/extractor/tagesschau.py index c351b7545..8ceab7e35 100644 --- a/youtube_dlc/extractor/tagesschau.py +++ b/youtube_dlc/extractor/tagesschau.py @@ -86,7 +86,7 @@ def _real_extract(self, url): # return self._extract_via_api(kind, video_id) # JSON api does not provide some audio formats (e.g. ogg) thus - # extractiong audio via webpage + # extracting audio via webpage webpage = self._download_webpage(url, video_id) diff --git a/youtube_dlc/extractor/theplatform.py b/youtube_dlc/extractor/theplatform.py index 07055513a..41bfbe80f 100644 --- a/youtube_dlc/extractor/theplatform.py +++ b/youtube_dlc/extractor/theplatform.py @@ -208,7 +208,7 @@ def _extract_urls(cls, webpage): if m: return [m.group('url')] - # Are whitesapces ignored in URLs? + # Are whitespaces ignored in URLs? # https://github.com/ytdl-org/youtube-dl/issues/12044 matches = re.findall( r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) diff --git a/youtube_dlc/extractor/thisvid.py b/youtube_dlc/extractor/thisvid.py new file mode 100644 index 000000000..f507e1b06 --- /dev/null +++ b/youtube_dlc/extractor/thisvid.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor + + +class ThisVidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+/?)' + _TESTS = [{ + 'url': 'https://thisvid.com/videos/french-boy-pantsed/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'ext': 'mp4', + 'title': 'French Boy Pantsed', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + 'age_limit': 18, + } + }, { + 'url': 'https://thisvid.com/embed/2400174/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'ext': 'mp4', + 'title': 'French Boy Pantsed', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + main_id = self._match_id(url) + webpage = self._download_webpage(url, main_id) + + # URL decryptor was reversed from version 4.0.4, later verified working with 5.2.0 and may change in the future. + kvs_version = self._html_search_regex(r'<script [^>]+?src="https://thisvid\.com/player/kt_player\.js\?v=(\d+(\.\d+)+)">', webpage, 'kvs_version', fatal=False) + if not kvs_version.startswith("5."): + self.report_warning("Major version change (" + kvs_version + ") in player engine--Download may fail.") + + title = self._html_search_regex(r'<title>(?:Video: )?(.+?)(?: - (?:\w+ porn at )?ThisVid(?:.com| tube))?', webpage, 'title') + # video_id, video_url and license_code from the 'flashvars' JSON object: + video_id = self._html_search_regex(r"video_id: '([0-9]+)',", webpage, 'video_id') + video_url = self._html_search_regex(r"video_url: '(function/0/.+?)',", webpage, 'video_url') + license_code = self._html_search_regex(r"license_code: '([0-9$]{16})',", webpage, 'license_code') + thumbnail = self._html_search_regex(r"preview_url: '((?:https?:)?//media.thisvid.com/.+?.jpg)',", webpage, 'thumbnail', fatal=False) + if thumbnail.startswith("//"): + thumbnail = "https:" + thumbnail + if (re.match(self._VALID_URL, url).group('type') == "videos"): + display_id = main_id + else: + display_id = self._search_regex(r'', webpage, 'display_id', fatal=False), + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'url': getrealurl(video_url, license_code), + 'thumbnail': thumbnail, + 'age_limit': 18, + } + + +def getrealurl(video_url, license_code): + urlparts = video_url.split('/')[2:] + license = getlicensetoken(license_code) + newmagic = urlparts[5][:32] + + for o in range(len(newmagic) - 1, -1, -1): + new = "" + l = (o + sum([int(n) for n in license[o:]])) % 32 + + for i in range(0, len(newmagic)): + if i == o: + new += newmagic[l] + elif i == l: + new += newmagic[o] + else: + new += newmagic[i] + newmagic = new + + urlparts[5] = newmagic + urlparts[5][32:] + return "/".join(urlparts) + + +def getlicensetoken(license): + modlicense = license.replace("$", "").replace("0", "1") + center = int(len(modlicense) / 2) + fronthalf = int(modlicense[:center + 1]) + backhalf = int(modlicense[center:]) + + modlicense = str(4 * abs(fronthalf - backhalf)) + retval = "" + for o in range(0, center + 1): + for i in range(1, 5): + retval += str((int(license[o + i]) + int(modlicense[o])) % 10) + return retval diff --git a/youtube_dlc/extractor/turner.py b/youtube_dlc/extractor/turner.py index 4a6cbfbb8..2964504a2 100644 --- a/youtube_dlc/extractor/turner.py +++ b/youtube_dlc/extractor/turner.py @@ -56,9 +56,9 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): content_id = xpath_text(video_data, 'contentId') or video_id # rtmp_src = xpath_text(video_data, 'akamai/src') # if rtmp_src: - # splited_rtmp_src = rtmp_src.split(',') - # if len(splited_rtmp_src) == 2: - # rtmp_src = splited_rtmp_src[1] + # split_rtmp_src = rtmp_src.split(',') + # if len(split_rtmp_src) == 2: + # rtmp_src = split_rtmp_src[1] # aifp = xpath_text(video_data, 'akamai/aifp', default='') urls = [] diff --git a/youtube_dlc/extractor/tvland.py b/youtube_dlc/extractor/tvland.py index 791144128..225b6b078 100644 --- a/youtube_dlc/extractor/tvland.py +++ b/youtube_dlc/extractor/tvland.py @@ -3,6 +3,8 @@ from .spike import ParamountNetworkIE +# TODO: Remove - Reason not used anymore - Service moved to youtube + class TVLandIE(ParamountNetworkIE): IE_NAME = 'tvland.com' diff --git a/youtube_dlc/extractor/twentythreevideo.py b/youtube_dlc/extractor/twentythreevideo.py index aa0c6e90f..dc5609192 100644 --- a/youtube_dlc/extractor/twentythreevideo.py +++ b/youtube_dlc/extractor/twentythreevideo.py @@ -8,8 +8,8 @@ class TwentyThreeVideoIE(InfoExtractor): IE_NAME = '23video' - _VALID_URL = r'https?://video\.(?Ptwentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P.*?\bphoto(?:_|%5f)id=(?P\d+).*)' - _TEST = { + _VALID_URL = r'https?://(?P[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P.*?\bphoto(?:_|%5f)id=(?P\d+).*)' + _TESTS = [{ 'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1', 'md5': '75fcf216303eb1dae9920d651f85ced4', 'info_dict': { @@ -21,11 +21,14 @@ class TwentyThreeVideoIE(InfoExtractor): 'uploader_id': '12258964', 'uploader': 'Rasmus Bysted', } - } + }, { + 'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620', + 'only_matching': True, + }] def _real_extract(self, url): domain, query, photo_id = re.match(self._VALID_URL, url).groups() - base_url = 'https://video.%s' % domain + base_url = 'https://%s' % domain photo_data = self._download_json( base_url + '/api/photo/list?' + query, photo_id, query={ 'format': 'json', diff --git a/youtube_dlc/extractor/urplay.py b/youtube_dlc/extractor/urplay.py index 6030b7cb5..2c41f78bd 100644 --- a/youtube_dlc/extractor/urplay.py +++ b/youtube_dlc/extractor/urplay.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_timestamp +from ..utils import ( + dict_get, + int_or_none, + unified_timestamp, +) class URPlayIE(InfoExtractor): @@ -15,8 +19,8 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', - 'timestamp': 1513512768, - 'upload_date': '20171217', + 'timestamp': 1513292400, + 'upload_date': '20171214', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -25,7 +29,7 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', - 'timestamp': 1440093600, + 'timestamp': 1440086400, 'upload_date': '20150820', }, }, { @@ -35,37 +39,65 @@ class URPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._search_regex( - r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) - host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + urplayer_data = self._parse_json(self._html_search_regex( + r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['currentProduct'] + episode = urplayer_data['title'] + host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] formats = [] - for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): - file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + urplayer_streams = urplayer_data.get('streamingInfo', {}) + + for k, v in urplayer_streams.get('raw', {}).items(): + if not (k in ('sd', 'hd') and isinstance(v, dict)): + continue + file_http = v.get('location') if file_http: formats.extend(self._extract_wowza_formats( - 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp'])) + 'http://%s/%splaylist.m3u8' % (host, file_http), + video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) subtitles = {} - for subtitle in urplayer_data.get('subtitles', []): - subtitle_url = subtitle.get('file') - kind = subtitle.get('kind') - if not subtitle_url or (kind and kind != 'captions'): - continue - subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ - 'url': subtitle_url, + subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location") + if subs: + subtitles.setdefault('Svenska', []).append({ + 'url': subs, }) + image = urplayer_data.get('image') or {} + thumbnails = [] + for k, v in image.items(): + t = { + 'id': k, + 'url': v, + } + wh = k.split('x') + if len(wh) == 2: + t.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + thumbnails.append(t) + + series = urplayer_data.get('series') or {} + series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) + return { 'id': video_id, - 'title': urplayer_data['title'], - 'description': self._og_search_description(webpage), - 'thumbnail': urplayer_data.get('image'), - 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')), - 'series': urplayer_data.get('series_title'), 'subtitles': subtitles, + 'title': '%s : %s' % (series_title, episode) if series_title else episode, + 'description': urplayer_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), + 'series': series_title, 'formats': formats, + 'duration': int_or_none(urplayer_data.get('duration')), + 'categories': urplayer_data.get('categories'), + 'tags': urplayer_data.get('keywords'), + 'season': series.get('label'), + 'episode': episode, + 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), } diff --git a/youtube_dlc/extractor/usanetwork.py b/youtube_dlc/extractor/usanetwork.py index 54c7495cc..d953e460b 100644 --- a/youtube_dlc/extractor/usanetwork.py +++ b/youtube_dlc/extractor/usanetwork.py @@ -1,74 +1,24 @@ # coding: utf-8 from __future__ import unicode_literals -from .adobepass import AdobePassIE -from ..utils import ( - NO_DEFAULT, - smuggle_url, - update_url_query, -) +from .nbc import NBCIE -class USANetworkIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P[^/?#]+)' - _TEST = { - 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', - 'md5': '33c0d2ba381571b414024440d08d57fd', +class USANetworkIE(NBCIE): + _VALID_URL = r'https?(?P://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P\d+))' + _TESTS = [{ + 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302', 'info_dict': { - 'id': '3086229', + 'id': '4185302', 'ext': 'mp4', - 'title': 'HPE Cybersecurity', - 'description': 'The more we digitize our world, the more vulnerable we are.', - 'upload_date': '20160818', - 'timestamp': 1471535460, - 'uploader': 'NBCU-USA', + 'title': 'Intelligence (Trailer)', + 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.', + 'upload_date': '20200715', + 'timestamp': 1594785600, + 'uploader': 'NBCU-MPAT', }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def _x(name, default=NO_DEFAULT): - return self._search_regex( - r'data-%s\s*=\s*(["\'])(?P(?:(?!\1).)+)\1' % name, - webpage, name, default=default, group='value') - - video_id = _x('mpx-guid') - title = _x('episode-title') - mpx_account_id = _x('mpx-account-id', '2304992029') - - query = { - 'mbr': 'true', - } - if _x('is-full-episode', None) == '1': - query['manifest'] = 'm3u' - - if _x('is-entitlement', None) == '1': - adobe_pass = {} - drupal_settings = self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', fatal=False) - if drupal_settings: - drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) - if drupal_settings: - adobe_pass = drupal_settings.get('adobePass', {}) - resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'usa'), - title, video_id, _x('episode-rating', 'TV-14')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) - - info = self._search_json_ld(webpage, video_id, default={}) - info.update({ - '_type': 'url_transparent', - 'url': smuggle_url(update_url_query( - 'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id), - query), {'force_smil_url': True}), - 'id': video_id, - 'title': title, - 'series': _x('show-title', None), - 'episode': title, - 'ie_key': 'ThePlatform', - }) - return info + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] diff --git a/youtube_dlc/extractor/ustream.py b/youtube_dlc/extractor/ustream.py index 582090d0d..9e860aeb7 100644 --- a/youtube_dlc/extractor/ustream.py +++ b/youtube_dlc/extractor/ustream.py @@ -19,7 +19,7 @@ class UstreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?Precorded|embed|embed/recorded)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?Precorded|embed|embed/recorded)/(?P\d+)' IE_NAME = 'ustream' _TESTS = [{ 'url': 'http://www.ustream.tv/recorded/20274954', @@ -67,12 +67,15 @@ class UstreamIE(InfoExtractor): 'params': { 'skip_download': True, # m3u8 download }, + }, { + 'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): mobj = re.search( - r']+?src=(["\'])(?Phttp://www\.ustream\.tv/embed/.+?)\1', webpage) + r']+?src=(["\'])(?Phttp://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) if mobj is not None: return mobj.group('url') diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py index f8e360338..09da4338d 100644 --- a/youtube_dlc/extractor/viki.py +++ b/youtube_dlc/extractor/viki.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import hashlib import hmac import itertools @@ -9,6 +10,10 @@ import time from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -16,6 +21,7 @@ parse_age_limit, parse_iso8601, sanitized_Request, + std_headers, ) @@ -166,19 +172,20 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '5fa476a902e902783ac7a4d615cdbc7a', + 'md5': '94e0e34fd58f169f40c184f232356cfe', 'info_dict': { 'id': '44699v', 'ext': 'mp4', 'title': 'Boys Over Flowers - Episode 1', 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4204, + 'duration': 4172, 'timestamp': 1270496524, 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, 'age_limit': 13, - } + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -195,14 +202,15 @@ class VikiIE(VikiBaseIE): 'uploader_id': 'ad14065n', 'like_count': int, 'age_limit': 13, - } + }, + 'skip': 'Page not found!', }, { 'url': 'http://www.viki.com/player/44699v', 'only_matching': True, }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '1713ae35df5a521b31f6dc40730e7c9c', + 'md5': 'adf9e321a0ae5d0aace349efaaff7691', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -218,71 +226,13 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - video = self._call_api( - 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') - - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - formats = [] - for format_id, stream_dict in streams.items(): - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - for protocol, format_dict in stream_dict.items(): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - continue - format_url = format_dict.get('url') - format_drms = format_dict.get('drms') - format_stream_id = format_dict.get('id') - if format_id == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.extend(m3u8_formats) - elif format_id == 'mpd': - mpd_formats = self._extract_mpd_formats( - format_url, video_id, - mpd_id='mpd-%s' % protocol, fatal=False) - formats.extend(mpd_formats) - elif format_id == 'mpd': - - formats.extend(mpd_formats) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?Prtmp://[^/]+/(?P.+?))/(?Pmp4:.+)$', - format_url) - if not mobj: - continue - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - 'drms': format_drms, - 'stream_id': format_stream_id, - }) - else: - urlh = self._request_webpage( - HEADRequest(format_url), video_id, 'Checking file size', fatal=False) - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': height, - 'drms': format_drms, - 'stream_id': format_stream_id, - 'filesize': int_or_none(urlh.headers.get('Content-Length')), - }) - self._sort_formats(formats) + resp = self._download_json( + 'https://www.viki.com/api/videos/' + video_id, + video_id, 'Downloading video JSON', headers={ + 'x-client-user-agent': std_headers['User-Agent'], + 'x-viki-app-ver': '4.0.57', + }) + video = resp['video'] self._check_errors(video) @@ -308,19 +258,26 @@ def _real_extract(self, url): 'url': thumbnail.get('url'), }) - stream_ids = [] - for f in formats: - s_id = f.get('stream_id') - if s_id is not None: - stream_ids.append(s_id) - subtitles = {} - for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): - subtitles[subtitle_lang] = [{ - 'ext': subtitles_format, - 'url': self._prepare_call( - 'videos/%s/subtitles/%s.%s?stream_id=%s' % (video_id, subtitle_lang, subtitles_format, stream_ids[0])), - } for subtitles_format in ('srt', 'vtt')] + try: + # New way to fetch subtitles + new_video = self._download_json( + 'https://www.viki.com/api/videos/%s' % video_id, video_id, + 'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404]) + for sub in new_video.get('streamSubtitles').get('dash'): + subtitles[sub.get('srclang')] = [{ + 'ext': 'vtt', + 'url': sub.get('src'), + 'completion': sub.get('percentage'), + }] + except AttributeError: + # fall-back to the old way if there isn't a streamSubtitles attribute + for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + subtitles[subtitle_lang] = [{ + 'ext': subtitles_format, + 'url': self._prepare_call( + 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), + } for subtitles_format in ('srt', 'vtt')] result = { 'id': video_id, @@ -335,12 +292,84 @@ def _real_extract(self, url): 'subtitles': subtitles, } - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result + formats = [] + + def add_format(format_id, format_dict, protocol='http'): + # rtmps URLs does not seem to work + if protocol == 'rtmps': + return + format_url = format_dict.get('url') + if not format_url: + return + format_drms = format_dict.get('drms') + format_stream_id = format_dict.get('id') + qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) + stream = qs.get('stream', [None])[0] + if stream: + format_url = base64.b64decode(stream).decode() + if format_id in ('m3u8', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if '_drm/index_' in f['url']: + continue + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + formats.append(f) + elif format_id in ('mpd', 'dash'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, fatal=False)) + elif format_url.startswith('rtmp'): + mobj = re.search( + r'^(?Prtmp://[^/]+/(?P.+?))/(?Pmp4:.+)$', + format_url) + if not mobj: + return + formats.append({ + 'format_id': 'rtmp-%s' % format_id, + 'ext': 'flv', + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': url, + 'drms': format_drms, + 'stream_id': format_stream_id, + }) + else: + urlh = self._request_webpage( + HEADRequest(format_url), video_id, 'Checking file size', fatal=False) + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (format_id, protocol), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)), + 'drms': format_drms, + 'stream_id': format_stream_id, + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + }) + + for format_id, format_dict in (resp.get('streams') or {}).items(): + add_format(format_id, format_dict) + if not formats: + streams = self._call_api( + 'videos/%s/streams.json' % video_id, video_id, + 'Downloading video streams JSON') + + if 'external' in streams: + result.update({ + '_type': 'url_transparent', + 'url': streams['external']['url'], + }) + return result + + for format_id, stream_dict in streams.items(): + for protocol, format_dict in stream_dict.items(): + add_format(format_id, format_dict, protocol) + self._sort_formats(formats) result['formats'] = formats return result diff --git a/youtube_dlc/extractor/vimeo.py b/youtube_dlc/extractor/vimeo.py index 9839657ca..51a0ab2fa 100644 --- a/youtube_dlc/extractor/vimeo.py +++ b/youtube_dlc/extractor/vimeo.py @@ -922,7 +922,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): }] _PAGE_SIZE = 100 - def _fetch_page(self, album_id, authorizaion, hashed_pass, page): + def _fetch_page(self, album_id, authorization, hashed_pass, page): api_page = page + 1 query = { 'fields': 'link,uri', @@ -934,7 +934,7 @@ def _fetch_page(self, album_id, authorizaion, hashed_pass, page): videos = self._download_json( 'https://api.vimeo.com/albums/%s/videos' % album_id, album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorizaion, + 'Authorization': 'jwt ' + authorization, })['data'] for video in videos: link = video.get('link') @@ -946,10 +946,13 @@ def _fetch_page(self, album_id, authorizaion, hashed_pass, page): def _real_extract(self, url): album_id = self._match_id(url) - webpage = self._download_webpage(url, album_id) - viewer = self._parse_json(self._search_regex( - r'bootstrap_data\s*=\s*({.+?})', - webpage, 'bootstrap data'), album_id)['viewer'] + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + if not viewer: + webpage = self._download_webpage(url, album_id) + viewer = self._parse_json(self._search_regex( + r'bootstrap_data\s*=\s*({.+?})', + webpage, 'bootstrap data'), album_id)['viewer'] jwt = viewer['jwt'] album = self._download_json( 'https://api.vimeo.com/albums/' + album_id, diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index f79531e6f..c07550810 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -1,25 +1,32 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import time import itertools +import json -from .common import InfoExtractor from .naver import NaverBaseIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, + int_or_none, merge_dicts, - remove_start, + str_or_none, + strip_or_none, try_get, urlencode_postdata, ) -class VLiveIE(NaverBaseIE): +class VLiveBaseIE(NaverBaseIE): + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + +class VLiveIE(VLiveBaseIE): IE_NAME = 'vlive' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P[0-9]+)' _NETRC_MACHINE = 'vlive' _TESTS = [{ 'url': 'http://www.vlive.tv/video/1326', @@ -27,7 +34,7 @@ class VLiveIE(NaverBaseIE): 'info_dict': { 'id': '1326', 'ext': 'mp4', - 'title': "[V LIVE] Girl's Day's Broadcast", + 'title': "Girl's Day's Broadcast", 'creator': "Girl's Day", 'view_count': int, 'uploader_id': 'muploader_a', @@ -37,7 +44,7 @@ class VLiveIE(NaverBaseIE): 'info_dict': { 'id': '16937', 'ext': 'mp4', - 'title': '[V LIVE] 첸백시 걍방', + 'title': '첸백시 걍방', 'creator': 'EXO', 'view_count': int, 'subtitles': 'mincount:12', @@ -58,12 +65,15 @@ class VLiveIE(NaverBaseIE): 'subtitles': 'mincount:10', }, 'skip': 'This video is only available for CH+ subscribers', + }, { + 'url': 'https://www.vlive.tv/embed/1326', + 'only_matching': True, + }, { + # works only with gcc=KR + 'url': 'https://www.vlive.tv/video/225019', + 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) - def _real_initialize(self): self._login() @@ -95,173 +105,199 @@ def is_logged_in(): if not is_logged_in(): raise ExtractorError('Unable to log in', expected=True) + def _call_api(self, path_template, video_id, fields=None): + query = {'appId': self._APP_ID, 'gcc': 'KR'} + if fields: + query['fields'] = fields + try: + return self._download_json( + 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, + 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], + headers={'Referer': 'https://www.vlive.tv/'}, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_login_required(json.loads(e.cause.read().decode())['message']) + raise + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://www.vlive.tv/video/%s' % video_id, video_id) + post = self._call_api( + 'post/v1.0/officialVideoPost-%s', video_id, + 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') - VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' - VIDEO_PARAMS_FIELD = 'video params' + video = post['officialVideo'] - params = self._parse_json(self._search_regex( - VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id, - transform_source=lambda s: '[' + s + ']', fatal=False) + def get_common_fields(): + channel = post.get('channel') or {} + return { + 'title': video.get('title'), + 'creator': post.get('author', {}).get('nickname'), + 'channel': channel.get('channelName'), + 'channel_id': channel.get('channelCode'), + 'duration': int_or_none(video.get('playTime')), + 'view_count': int_or_none(video.get('playCount')), + 'like_count': int_or_none(video.get('likeCount')), + 'comment_count': int_or_none(video.get('commentCount')), + } - if not params or len(params) < 7: - params = self._search_regex( - VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD) - params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)] - - status, long_video_id, key = params[2], params[5], params[6] - status = remove_start(status, 'PRODUCT_') - - if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'): - return self._live(video_id, webpage) - elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'): - return self._replay(video_id, webpage, long_video_id, key) - - if status == 'LIVE_END': - raise ExtractorError('Uploading for replay. Please wait...', - expected=True) - elif status == 'COMING_SOON': - raise ExtractorError('Coming soon!', expected=True) - elif status == 'CANCELED': - raise ExtractorError('We are sorry, ' - 'but the live broadcast has been canceled.', - expected=True) - elif status == 'ONLY_APP': - raise ExtractorError('Unsupported video type', expected=True) - else: - raise ExtractorError('Unknown status %s' % status) - - def _get_common_fields(self, webpage): - title = self._og_search_title(webpage) - creator = self._html_search_regex( - r']+class="info_area"[^>]*>\s*(?:]*>.*?\s*)?]*>([^<]+)', - webpage, 'creator', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - return { - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - } - - def _live(self, video_id, webpage): - init_page = self._download_init_page(video_id) - - live_params = self._search_regex( - r'"liveStreamInfo"\s*:\s*(".*"),', - init_page, 'live stream info') - live_params = self._parse_json(live_params, video_id) - live_params = self._parse_json(live_params, video_id) - - formats = [] - for vid in live_params.get('resolutions', []): - formats.extend(self._extract_m3u8_formats( - vid['cdnUrl'], video_id, 'mp4', - m3u8_id=vid.get('name'), - fatal=False, live=True)) - self._sort_formats(formats) - - info = self._get_common_fields(webpage) - info.update({ - 'title': self._live_title(info['title']), - 'id': video_id, - 'formats': formats, - 'is_live': True, - }) - return info - - def _replay(self, video_id, webpage, long_video_id, key): - if '' in (long_video_id, key): - init_page = self._download_init_page(video_id) - video_info = self._parse_json(self._search_regex( - (r'(?s)oVideoStatus\s*=\s*({.+?})\s*\d-\d+)' + _TESTS = [{ + # uploadType = SOS + 'url': 'https://www.vlive.tv/post/1-20088044', + 'info_dict': { + 'id': '1-20088044', + 'title': 'Hola estrellitas la tierra les dice hola (si era así no?) Ha...', + 'description': 'md5:fab8a1e50e6e51608907f46c7fa4b407', + }, + 'playlist_count': 3, + }, { + # uploadType = V + 'url': 'https://www.vlive.tv/post/1-20087926', + 'info_dict': { + 'id': '1-20087926', + 'title': 'James Corden: And so, the baby becamos the Papa💜😭💪😭', + }, + 'playlist_count': 1, + }] + _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s' + _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo' + _INKEY_TMPL = _FVIDEO_TMPL % 'inKey' + + def _real_extract(self, url): + post_id = self._match_id(url) + + post = self._call_api( + 'post/v1.0/post-%s', post_id, + 'attachments{video},officialVideo{videoSeq},plainBody,title') + + video_seq = str_or_none(try_get( + post, lambda x: x['officialVideo']['videoSeq'])) + if video_seq: + return self.url_result( + 'http://www.vlive.tv/video/' + video_seq, + VLiveIE.ie_key(), video_seq) + + title = post['title'] + entries = [] + for idx, video in enumerate(post['attachments']['video'].values()): + video_id = video.get('videoId') + if not video_id: + continue + upload_type = video.get('uploadType') + upload_info = video.get('uploadInfo') or {} + entry = None + if upload_type == 'SOS': + download = self._call_api( + self._SOS_TMPL, video_id)['videoUrl']['download'] + formats = [] + for f_id, f_url in download.items(): + formats.append({ + 'format_id': f_id, + 'url': f_url, + 'height': int_or_none(f_id[:-1]), + }) + self._sort_formats(formats) + entry = { + 'formats': formats, + 'id': video_id, + 'thumbnail': upload_info.get('imageUrl'), + } + elif upload_type == 'V': + vod_id = upload_info.get('videoId') + if not vod_id: + continue + inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey'] + entry = self._extract_video_info(video_id, vod_id, inkey) + if entry: + entry['title'] = '%s_part%s' % (title, idx) + entries.append(entry) + return self.playlist_result( + entries, post_id, title, strip_or_none(post.get('plainBody'))) + + +class VLiveChannelIE(VLiveBaseIE): IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://channels\.vlive\.tv/(?P[0-9A-Z]+)' - _TEST = { + _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P[0-9A-Z]+)' + _TESTS = [{ 'url': 'http://channels.vlive.tv/FCD4B', 'info_dict': { 'id': 'FCD4B', 'title': 'MAMAMOO', }, 'playlist_mincount': 110 - } - _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + }, { + 'url': 'https://www.vlive.tv/channel/FCD4B', + 'only_matching': True, + }] + + def _call_api(self, path, channel_key_suffix, channel_value, note, query): + q = { + 'app_id': self._APP_ID, + 'channel' + channel_key_suffix: channel_value, + } + q.update(query) + return self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, + channel_value, note='Downloading ' + note, query=q)['result'] def _real_extract(self, url): channel_code = self._match_id(url) - webpage = self._download_webpage( - 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + channel_seq = self._call_api( + 'decodeChannelCode', 'Code', channel_code, + 'decode channel code', {})['channelSeq'] - app_id = None - - app_js_url = self._search_regex( - r']+src=(["\'])(?Phttp.+?/app\.js.*?)\1', - webpage, 'app js', default=None, group='url') - - if app_js_url: - app_js = self._download_webpage( - app_js_url, channel_code, 'Downloading app JS', fatal=False) - if app_js: - app_id = self._search_regex( - r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', - app_js, 'app id', default=None) - - app_id = app_id or self._APP_ID - - channel_info = self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', - channel_code, note='Downloading decode channel code', - query={ - 'app_id': app_id, - 'channelCode': channel_code, - '_': int(time.time()) - }) - - channel_seq = channel_info['result']['channelSeq'] channel_name = None entries = [] for page_num in itertools.count(1): - video_list = self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', - channel_code, note='Downloading channel list page #%d' % page_num, - query={ - 'app_id': app_id, - 'channelSeq': channel_seq, + video_list = self._call_api( + 'getChannelVideoList', 'Seq', channel_seq, + 'channel list page #%d' % page_num, { # Large values of maxNumOfRows (~300 or above) may cause # empty responses (see [1]), e.g. this happens for [2] that # has more than 300 videos. # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 # 2. http://channels.vlive.tv/EDBF. 'maxNumOfRows': 100, - '_': int(time.time()), 'pageNo': page_num } ) @@ -269,99 +305,44 @@ def _real_extract(self, url): if not channel_name: channel_name = try_get( video_list, - lambda x: x['result']['channelInfo']['channelName'], + lambda x: x['channelInfo']['channelName'], compat_str) videos = try_get( - video_list, lambda x: x['result']['videoList'], list) + video_list, lambda x: x['videoList'], list) if not videos: break for video in videos: video_id = video.get('videoSeq') - if not video_id: + video_type = video.get('videoType') + + if not video_id or not video_type: continue video_id = compat_str(video_id) - entries.append( - self.url_result( - 'http://www.vlive.tv/video/%s' % video_id, - ie=VLiveIE.ie_key(), video_id=video_id)) + + if video_type in ('PLAYLIST'): + playlist_videos = try_get( + video, + lambda x: x['videoPlaylist']['videoList'], list) + if not playlist_videos: + continue + + for playlist_video in playlist_videos: + playlist_video_id = playlist_video.get('videoSeq') + if not playlist_video_id: + continue + playlist_video_id = compat_str(playlist_video_id) + + entries.append( + self.url_result( + 'http://www.vlive.tv/video/%s' % playlist_video_id, + ie=VLiveIE.ie_key(), video_id=playlist_video_id)) + else: + entries.append( + self.url_result( + 'http://www.vlive.tv/video/%s' % video_id, + ie=VLiveIE.ie_key(), video_id=video_id)) return self.playlist_result( entries, channel_code, channel_name) - - -class VLivePlaylistIE(InfoExtractor): - IE_NAME = 'vlive:playlist' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P[0-9]+)/playlist/(?P[0-9]+)' - _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' - _TESTS = [{ - # regular working playlist - 'url': 'https://www.vlive.tv/video/117956/playlist/117963', - 'info_dict': { - 'id': '117963', - 'title': '아이돌룸(IDOL ROOM) 41회 - (여자)아이들' - }, - 'playlist_mincount': 10 - }, { - # playlist with no playlistVideoSeqs - 'url': 'http://www.vlive.tv/video/22867/playlist/22912', - 'info_dict': { - 'id': '22867', - 'ext': 'mp4', - 'title': '[V LIVE] Valentine Day Message from MINA', - 'creator': 'TWICE', - 'view_count': int - }, - 'params': { - 'skip_download': True, - } - }] - - def _build_video_result(self, video_id, message): - self.to_screen(message) - return self.url_result( - self._VIDEO_URL_TEMPLATE % video_id, - ie=VLiveIE.ie_key(), video_id=video_id) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, playlist_id = mobj.group('video_id', 'id') - - if self._downloader.params.get('noplaylist'): - return self._build_video_result( - video_id, - 'Downloading just video %s because of --no-playlist' - % video_id) - - self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download video' - % playlist_id) - - webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s/playlist/%s' - % (video_id, playlist_id), playlist_id) - - raw_item_ids = self._search_regex( - r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage, - 'playlist video seqs', default=None, fatal=False) - - if not raw_item_ids: - return self._build_video_result( - video_id, - 'Downloading just video %s because no playlist was found' - % video_id) - - item_ids = self._parse_json(raw_item_ids, playlist_id) - - entries = [ - self.url_result( - self._VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(), - video_id=compat_str(item_id)) - for item_id in item_ids] - - playlist_name = self._html_search_regex( - r']+class="[^"]*multicam_playlist[^>]*>\s*]+>([^<]+)', - webpage, 'playlist title', fatal=False) - - return self.playlist_result(entries, playlist_id, playlist_name) diff --git a/youtube_dlc/extractor/xiami.py b/youtube_dlc/extractor/xiami.py index 618da8382..769aab331 100644 --- a/youtube_dlc/extractor/xiami.py +++ b/youtube_dlc/extractor/xiami.py @@ -54,17 +54,17 @@ def _extract_tracks(self, item_id, referer, typ=None): def _decrypt(origin): n = int(origin[0]) origin = origin[1:] - short_lenth = len(origin) // n - long_num = len(origin) - short_lenth * n + short_length = len(origin) // n + long_num = len(origin) - short_length * n l = tuple() for i in range(0, n): - length = short_lenth + length = short_length if i < long_num: length += 1 l += (origin[0:length], ) origin = origin[length:] ans = '' - for i in range(0, short_lenth + 1): + for i in range(0, short_length + 1): for j in range(0, n): if len(l[j]) > i: ans += l[j][i] diff --git a/youtube_dlc/extractor/xtube.py b/youtube_dlc/extractor/xtube.py index 01b253dcb..98d2adb99 100644 --- a/youtube_dlc/extractor/xtube.py +++ b/youtube_dlc/extractor/xtube.py @@ -38,22 +38,6 @@ class XTubeIE(InfoExtractor): 'comment_count': int, 'age_limit': 18, } - }, { - # FLV videos with duplicated formats - 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', - 'md5': 'a406963eb349dd43692ec54631efd88b', - 'info_dict': { - 'id': '9299752', - 'display_id': 'A-Super-Run-Part-1-YT', - 'ext': 'flv', - 'title': 'A Super Run - Part 1 (YT)', - 'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616', - 'uploader': 'tshirtguy59', - 'duration': 579, - 'view_count': int, - 'comment_count': int, - 'age_limit': 18, - }, }, { # new URL schema 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', @@ -90,7 +74,7 @@ def _real_extract(self, url): title, thumbnail, duration = [None] * 3 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') diff --git a/youtube_dlc/extractor/youporn.py b/youtube_dlc/extractor/youporn.py index e7fca22de..7b9feafeb 100644 --- a/youtube_dlc/extractor/youporn.py +++ b/youtube_dlc/extractor/youporn.py @@ -29,7 +29,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20101217', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -48,7 +47,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20110418', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -156,7 +154,8 @@ def _real_extract(self, url): r'(?s)]+class=["\']submitByLink["\'][^>]*>(.+?)', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - [r'Date\s+[Aa]dded:\s*([^<]+)', + [r'UPLOADED:\s*([^<]+)', + r'Date\s+[Aa]dded:\s*([^<]+)', r'(?s)]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)'], webpage, 'upload date', fatal=False)) @@ -171,7 +170,7 @@ def _real_extract(self, url): webpage, 'view count', fatal=False, group='count')) comment_count = str_to_int(self._search_regex( r'>All [Cc]omments? \(([\d,.]+)\)', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', default=None)) def extract_tag_box(regex, title): tag_box = self._search_regex(regex, webpage, title, default=None) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 4fb49b864..e87692754 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -16,7 +16,6 @@ from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, - compat_HTTPError, compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, @@ -30,14 +29,11 @@ bool_or_none, clean_html, error_to_compat_str, - extract_attributes, ExtractorError, float_or_none, - get_element_by_attribute, get_element_by_id, int_or_none, mimetype2ext, - orderedSet, parse_codecs, parse_count, parse_duration, @@ -50,9 +46,11 @@ unescapeHTML, unified_strdate, unsmuggle_url, + update_url_query, uppercase_escape, url_or_none, urlencode_postdata, + urljoin, ) @@ -65,11 +63,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _RESERVED_NAMES = ( + r'course|embed|channel|c|user|playlist|watch|w|results|storefront|oops|' + r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|' + r'feed/(watch_later|history|subscriptions|library|trending|recommended)') + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -274,11 +277,19 @@ def warn(message): def _download_webpage_handle(self, *args, **kwargs): query = kwargs.get('query', {}).copy() - query['disable_polymer'] = 'true' kwargs['query'] = query return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) + def _get_yt_initial_data(self, video_id, webpage): + config = self._search_regex( + (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});', + r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'), + webpage, 'ytInitialData', default=None) + if config: + return self._parse_json( + uppercase_escape(config), video_id, fatal=False) + def _real_initialize(self): if self._downloader is None: return @@ -286,93 +297,36 @@ def _real_initialize(self): if not self._login(): return + _DEFAULT_API_DATA = { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20201021.03.00', + } + }, + } -class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): - # Extract entries from page with "Load more" button - def _entries(self, page, playlist_id): - more_widget_html = content_html = page - for page_num in itertools.count(1): - for entry in self._process_page(content_html): - yield entry + _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' - mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) - if not mobj: - break + def _call_api(self, ep, query, video_id): + data = self._DEFAULT_API_DATA.copy() + data.update(query) - count = 0 - retries = 3 - while count <= retries: - try: - # Downloading page may result in intermittent 5xx HTTP error - # that is usually worked around with a retry - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s%s' - % (page_num, ' (retry #%d)' % count if count else ''), - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): - count += 1 - if count <= retries: - continue - raise + response = self._download_json( + 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id, + note='Downloading API JSON', errnote='Unable to download API page', + data=json.dumps(data).encode('utf8'), + headers={'content-type': 'application/json'}, + query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'}) - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] + return response - -class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) - - def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): - for mobj in re.finditer(video_re, page): - # The link with index 0 is not the first video of the playlist (not sure if still actual) - if 'index' in mobj.groupdict() and mobj.group('id') == '0': - continue - video_id = mobj.group('id') - video_title = unescapeHTML( - mobj.group('title')) if 'title' in mobj.groupdict() else None - if video_title: - video_title = video_title.strip() - if video_title == '► Play all': - video_title = None - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) - - -class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for playlist_id in orderedSet(re.findall( - r']+class="[^"]*yt-lockup-title[^"]*"[^>]*>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', - content)): - yield self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._og_search_title(webpage, fatal=False) - return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) + def _extract_yt_initial_data(self, video_id, webpage): + return self._parse_json( + self._search_regex( + (r'%s\s*\n' % self._YT_INITIAL_DATA_RE, + self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), + video_id) class YoutubeIE(YoutubeBaseInfoExtractor): @@ -433,7 +387,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID + (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?!.*?\blist= (?: %(playlist_id)s| # combined list/video URLs are handled by the playlist IE @@ -597,7 +551,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } }, { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', @@ -638,6 +592,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'skip': 'format 141 not served anymore', }, + # DASH manifest with encrypted signature + { + 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', + 'info_dict': { + 'id': 'IB3lcPjvWLA', + 'ext': 'm4a', + 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', + 'duration': 244, + 'uploader': 'AfrojackVEVO', + 'uploader_id': 'AfrojackVEVO', + 'upload_date': '20131011', + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141/bestaudio[ext=m4a]', + }, + }, # Controversy video { 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', @@ -669,6 +641,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'age_limit': 18, }, }, + # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator + { + 'url': '__2ABJjxzNo', + 'info_dict': { + 'id': '__2ABJjxzNo', + 'ext': 'mp4', + 'duration': 266, + 'upload_date': '20100430', + 'uploader_id': 'deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', + 'creator': 'Dada Life, deadmau5', + 'description': 'md5:12c56784b8032162bb936a5f76d55360', + 'uploader': 'deadmau5', + 'title': 'Deadmau5 - Some Chords (HD)', + 'alt_title': 'This Machine Kills Some Chords', + }, + 'expected_warnings': [ + 'DASH manifest missing', + ] + }, # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) { 'url': 'lqQg6PlCWgI', @@ -1007,10 +1000,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'sJL6WA-aGkQ', 'only_matching': True, }, - { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, { 'url': 'https://invidio.us/watch?v=BaW_jenozKc', 'only_matching': True, @@ -1062,73 +1051,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, - { - # Youtube Music Auto-generated description - # Retrieve 'artist' field from 'Artist:' in video description - # when it is present on youtube music video - 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', - 'info_dict': { - 'id': 'k0jLE7tTwjY', - 'ext': 'mp4', - 'title': 'Latch Feat. Sam Smith', - 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', - 'upload_date': '20150110', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', - 'artist': 'Disclosure', - 'track': 'Latch Feat. Sam Smith', - 'album': 'Latch Featuring Sam Smith', - 'release_date': '20121008', - 'release_year': 2012, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle multiple artists on youtube music video - 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', - 'info_dict': { - 'id': '74qn0eJSjpA', - 'ext': 'mp4', - 'title': 'Eastside', - 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', - 'upload_date': '20180710', - 'uploader': 'Benny Blanco - Topic', - 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', - 'artist': 'benny blanco, Halsey, Khalid', - 'track': 'Eastside', - 'album': 'Eastside', - 'release_date': '20180713', - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle youtube music video with release_year and no release_date - 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', - 'info_dict': { - 'id': '-hcAI0g-f5M', - 'ext': 'mp4', - 'title': 'Put It On Me', - 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', - 'upload_date': '20180426', - 'uploader': 'Matt Maeson - Topic', - 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', - 'artist': 'Matt Maeson', - 'track': 'Put It On Me', - 'album': 'The Hearse', - 'release_date': None, - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, { 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', 'only_matching': True, @@ -1169,6 +1091,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093) + 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', + 'info_dict': { + 'id': 'CHqg6qOn4no', + 'ext': 'mp4', + 'title': 'Part 77 Sort a list of simple types in c#', + 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', + 'upload_date': '20130831', + 'uploader_id': 'kudvenkat', + 'uploader': 'kudvenkat', + }, + 'params': { + 'skip_download': True, + }, + }, ] def __init__(self, *args, **kwargs): @@ -1397,15 +1335,6 @@ def _get_ytplayer_config(self, video_id, webpage): return self._parse_json( uppercase_escape(config), video_id, fatal=False) - def _get_yt_initial_data(self, video_id, webpage): - config = self._search_regex( - (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});', - r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'), - webpage, 'ytInitialData', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" @@ -1481,21 +1410,14 @@ def make_captions(sub_url, sub_langs): player_response, video_id, fatal=False) if player_response: renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - caption_tracks = renderer['captionTracks'] - for caption_track in caption_tracks: - if 'kind' not in caption_track: - # not an automatic transcription - continue - base_url = caption_track['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) + base_url = renderer['captionTracks'][0]['baseUrl'] + sub_lang_list = [] + for lang in renderer['translationLanguages']: + lang_code = lang.get('languageCode') + if lang_code: + sub_lang_list.append(lang_code) + return make_captions(base_url, sub_lang_list) - self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id) - return {} # Some videos don't provide ttsurl but rather caption_tracks and # caption_translation_languages (e.g. 20LmZk1hakA) # Does not used anymore as of 22.06.2017 @@ -1589,15 +1511,11 @@ def extract_id(cls, url): def _extract_chapters_from_json(self, webpage, video_id, duration): if not webpage: return - initial_data = self._parse_json( - self._search_regex( - r'window\["ytInitialData"\] = (.+);\n', webpage, - 'player args', default='{}'), - video_id, fatal=False) - if not initial_data or not isinstance(initial_data, dict): + data = self._extract_yt_initial_data(video_id, webpage) + if not data or not isinstance(data, dict): return chapters_list = try_get( - initial_data, + data, lambda x: x['playerOverlays'] ['playerOverlayRenderer'] ['decoratedPlayerBarRenderer'] @@ -1784,21 +1702,24 @@ def extract_embedded_config(embed_webpage, video_id): # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) + args = ytplayer_config.get("args") + if args is not None: + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): + # Convert to the same format returned by compat_parse_qs + video_info = dict((k, [v]) for k, v in args.items()) + add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. + # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/ytdl-org/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) + if args.get('livestream') == '1' or args.get('live_playback') == 1: + is_live = True + if not player_response: + player_response = extract_player_response(args.get('player_response'), video_id) + elif not player_response: + player_response = ytplayer_config if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) else: @@ -1829,7 +1750,7 @@ def extract_embedded_config(embed_webpage, video_id): # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: - args = ytplayer_config['args'] + args = ytplayer_config.get('args', {}) if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) @@ -1847,6 +1768,13 @@ def extract_embedded_config(embed_webpage, video_id): if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) + if not video_info and not player_response: + player_response = extract_player_response( + self._search_regex( + r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage, + 'initial player response', default='{}'), + video_id) + def extract_unavailable_message(): messages = [] for tag, kind in (('h1', 'message'), ('div', 'submessage')): @@ -2051,7 +1979,10 @@ def _extract_filesize(media_url): if cipher: if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' + ASSETS_RE = ( + r']+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base', + r'"jsUrl"\s*:\s*("[^"]+")', + r'"assets":.+?"js":\s*("[^"]+")') jsplayer_url_json = self._search_regex( ASSETS_RE, embed_webpage if age_gate else video_webpage, @@ -2186,6 +2117,21 @@ def _extract_filesize(media_url): formats.append(a_format) else: error_message = extract_unavailable_message() + if not error_message: + reason_list = try_get( + player_response, + lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], + list) or [] + for reason in reason_list: + if not isinstance(reason, dict): + continue + reason_text = try_get(reason, lambda x: x['text'], compat_str) + if reason_text: + if not error_message: + error_message = '' + error_message += reason_text + if error_message: + error_message = clean_html(error_message) if not error_message: error_message = clean_html(try_get( player_response, lambda x: x['playabilityStatus']['reason'], @@ -2311,7 +2257,7 @@ def extract_meta(field): # Youtube Music Auto-generated description release_date = release_year = None if video_description: - mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P[^·]+)·(?P[^\n]+)\n+(?P[^\n]+)(?:.+?℗\s*(?P\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P[^\n]+))?', video_description) + mobj = re.search(r'(?s)(?P[^·\n]+)·(?P[^\n]+)\n+(?P[^\n]+)(?:.+?℗\s*(?P\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: if not track: track = mobj.group('track').strip() @@ -2328,6 +2274,34 @@ def extract_meta(field): if release_year: release_year = int(release_year) + yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage) + contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] + for content in contents: + rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or [] + multiple_songs = False + for row in rows: + if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: + multiple_songs = True + break + for row in rows: + mrr = row.get('metadataRowRenderer') or {} + mrr_title = try_get( + mrr, lambda x: x['title']['simpleText'], compat_str) + mrr_contents = try_get( + mrr, lambda x: x['contents'][0], dict) or {} + mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str) + if not (mrr_title and mrr_contents_text): + continue + if mrr_title == 'License': + video_license = mrr_contents_text + elif not multiple_songs: + if mrr_title == 'Album': + album = mrr_contents_text + elif mrr_title == 'Artist': + artist = mrr_contents_text + elif mrr_title == 'Song': + track = mrr_contents_text + m_episode = re.search( r']+id="watch7-headline"[^>]*>\s*]*>.*?>(?P[^<]+)\s*S(?P\d+)\s*•\s*E(?P\d+)', video_webpage) @@ -2359,8 +2333,8 @@ def extract_meta(field): def _extract_count(count_name): return str_to_int(self._search_regex( - r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}' - % re.escape(count_name), + (r'-%s-button[^>]+>]+class="yt-uix-button-content"[^>]*>([\d,]+)' % re.escape(count_name), + r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)), video_webpage, count_name, default=None)) like_count = _extract_count('like') @@ -2537,38 +2511,59 @@ def decrypt_sig(mobj): } -class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com playlists' - _VALID_URL = r"""(?x)(?: - (?:https?://)? +class YoutubeTabIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com tab' + _VALID_URL = r'''(?x) + https?:// (?:\w+\.)? (?: - (?: - youtube(?:kids)?\.com| - invidio\.us - ) - / - (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ + youtube(?:kids)?\.com| + invidio\.us + )/ + (?: + (?:channel|c|user)/| + (?P + feed/| + (?:playlist|watch)\?.*?\blist= )| - youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= + (?!(%s)([/#?]|$)) # Direct URLs ) - ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,} - # Top tracks, they can also include dots - |(?:MC)[\w\.]* - ) - .* - | - (%(playlist_id)s) - )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P\d+))?(?:[^>]+>(?P[^<]+))?)?' - _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' - IE_NAME = 'youtube:playlist' + (?P<id>[^/?\#&]+) + ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES + IE_NAME = 'youtube:tab' + _TESTS = [{ + # playlists, multipage + 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + }, + }, { + # playlists, multipage, different order + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + }, + }, { + # playlists, singlepage + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'title': 'ThirstForScience - Playlists', + 'description': 'md5:609399d937ea957b0f53cbffb747a14c', + } + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, + }, { + # basic, single video playlist 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', @@ -2578,6 +2573,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, 'playlist_count': 1, }, { + # empty playlist 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', @@ -2586,6 +2582,69 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'youtube-dl empty playlist', }, 'playlist_count': 0, + }, { + # Home tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Home', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 2, + }, { + # Videos tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 975, + }, { + # Videos tab, sorted by popular + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 199, + }, { + # Playlists tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Playlists', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 17, + }, { + # Community tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Community', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 18, + }, { + # Channels tab + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Channels', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + }, + 'playlist_mincount': 138, + }, { + 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -2593,19 +2652,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'uploader': 'Christiaan008', - 'uploader_id': 'ChRiStIaAn008', + 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', }, 'playlist_count': 96, - }, { - 'note': 'issue #673', - 'url': 'PLBB231211A4F62143', - 'info_dict': { - 'title': '[OLD]Team Fortress 2 (Class-based LP)', - 'id': 'PLBB231211A4F62143', - 'uploader': 'Wickydoo', - 'uploader_id': 'Wickydoo', - }, - 'playlist_mincount': 26, }, { 'note': 'Large playlist', 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', @@ -2613,45 +2662,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', 'uploader': 'Cauchemar', - 'uploader_id': 'Cauchemar89', + 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', }, - 'playlist_mincount': 799, + 'playlist_mincount': 1123, }, { - 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - 'info_dict': { - 'title': 'YDL_safe_search', - 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - }, - 'playlist_count': 2, - 'skip': 'This playlist is private', - }, { - 'note': 'embedded', - 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA15', - 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'uploader': 'milan', - 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', - } - }, { - 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 485, - 'info_dict': { - 'title': '2018 Chinese New Singles (11/6 updated)', - 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'uploader': 'LBK', - 'uploader_id': 'sdragonfang', - } - }, { - 'note': 'Embedded SWF player', - 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA7', - 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - }, - 'skip': 'This playlist does not exist', + # even larger playlist, 8832 videos + 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', + 'only_matching': True, }, { 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', @@ -2659,9 +2676,22 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', 'uploader': 'Interstellar Movie', - 'uploader_id': 'InterstellarMovie1', + 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincount': 21, + }, { + # https://github.com/ytdl-org/youtube-dl/issues/21844 + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', + 'uploader': 'Computerphile', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'only_matching': True, }, { # Playlist URL that does not actually serve a playlist 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', @@ -2686,6 +2716,639 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, 'skip': 'This video is not available.', 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', + 'info_dict': { + 'id': '9Auq9mYxFEE', + 'ext': 'mp4', + 'title': 'Watch Sky News live', + 'uploader': 'Sky News', + 'uploader_id': 'skynews', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', + 'upload_date': '20191102', + 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662', + 'categories': ['News & Politics'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/trending', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/library', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/history', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/subscriptions', + 'only_matching': True, + }, { + # needs auth + 'url': 'https://www.youtube.com/feed/watch_later', + 'only_matching': True, + }, { + # no longer available? + 'url': 'https://www.youtube.com/feed/recommended', + 'only_matching': True, + } + # TODO + # { + # 'url': 'https://www.youtube.com/TheYoungTurks/live', + # 'only_matching': True, + # } + ] + + def _extract_channel_id(self, webpage): + channel_id = self._html_search_meta( + 'channelId', webpage, 'channel id', default=None) + if channel_id: + return channel_id + channel_url = self._html_search_meta( + ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url', + 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad', + 'twitter:app:url:googleplay'), webpage, 'channel url') + return self._search_regex( + r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+', + channel_url, 'channel id') + + @staticmethod + def _extract_grid_item_renderer(item): + for item_kind in ('Playlist', 'Video', 'Channel'): + renderer = item.get('grid%sRenderer' % item_kind) + if renderer: + return renderer + + def _extract_video(self, renderer): + video_id = renderer.get('videoId') + title = try_get( + renderer, + (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) + description = try_get( + renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], + compat_str) + duration = parse_duration(try_get( + renderer, lambda x: x['lengthText']['simpleText'], compat_str)) + view_count_text = try_get( + renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' + view_count = str_to_int(self._search_regex( + r'^([\d,]+)', re.sub(r'\s', '', view_count_text), + 'view count', default=None)) + uploader = try_get( + renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + return { + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + } + + def _grid_entries(self, grid_renderer): + for item in grid_renderer['items']: + if not isinstance(item, dict): + continue + renderer = self._extract_grid_item_renderer(item) + if not isinstance(renderer, dict): + continue + title = try_get( + renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + # playlist + playlist_id = renderer.get('playlistId') + if playlist_id: + yield self.url_result( + 'https://www.youtube.com/playlist?list=%s' % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=title) + # video + video_id = renderer.get('videoId') + if video_id: + yield self._extract_video(renderer) + # channel + channel_id = renderer.get('channelId') + if channel_id: + title = try_get( + renderer, lambda x: x['title']['simpleText'], compat_str) + yield self.url_result( + 'https://www.youtube.com/channel/%s' % channel_id, + ie=YoutubeTabIE.ie_key(), video_title=title) + + def _shelf_entries_from_content(self, shelf_renderer): + content = shelf_renderer.get('content') + if not isinstance(content, dict): + return + renderer = content.get('gridRenderer') + if renderer: + # TODO: add support for nested playlists so each shelf is processed + # as separate playlist + # TODO: this includes only first N items + for entry in self._grid_entries(renderer): + yield entry + renderer = content.get('horizontalListRenderer') + if renderer: + # TODO + pass + + def _shelf_entries(self, shelf_renderer): + ep = try_get( + shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str) + shelf_url = urljoin('https://www.youtube.com', ep) + if shelf_url: + title = try_get( + shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + yield self.url_result(shelf_url, video_title=title) + # Shelf may not contain shelf URL, fallback to extraction from content + for entry in self._shelf_entries_from_content(shelf_renderer): + yield entry + + def _playlist_entries(self, video_list_renderer): + for content in video_list_renderer['contents']: + if not isinstance(content, dict): + continue + renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer') + if not isinstance(renderer, dict): + continue + video_id = renderer.get('videoId') + if not video_id: + continue + yield self._extract_video(renderer) + + r""" # Not needed in the new implementation + def _itemSection_entries(self, item_sect_renderer): + for content in item_sect_renderer['contents']: + if not isinstance(content, dict): + continue + renderer = content.get('videoRenderer', {}) + if not isinstance(renderer, dict): + continue + video_id = renderer.get('videoId') + if not video_id: + continue + yield self._extract_video(renderer) + """ + + def _rich_entries(self, rich_grid_renderer): + renderer = try_get( + rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {} + video_id = renderer.get('videoId') + if not video_id: + return + yield self._extract_video(renderer) + + def _video_entry(self, video_renderer): + video_id = video_renderer.get('videoId') + if video_id: + return self._extract_video(video_renderer) + + def _post_thread_entries(self, post_thread_renderer): + post_renderer = try_get( + post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) + if not post_renderer: + return + # video attachment + video_renderer = try_get( + post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) + video_id = None + if video_renderer: + entry = self._video_entry(video_renderer) + if entry: + yield entry + # inline video links + runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or [] + for run in runs: + if not isinstance(run, dict): + continue + ep_url = try_get( + run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str) + if not ep_url: + continue + if not YoutubeIE.suitable(ep_url): + continue + ep_video_id = YoutubeIE._match_id(ep_url) + if video_id == ep_video_id: + continue + yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id) + + def _post_thread_continuation_entries(self, post_thread_continuation): + contents = post_thread_continuation.get('contents') + if not isinstance(contents, list): + return + for content in contents: + renderer = content.get('backstagePostThreadRenderer') + if not isinstance(renderer, dict): + continue + for entry in self._post_thread_entries(renderer): + yield entry + + @staticmethod + def _extract_next_continuation_data(renderer): + next_continuation = try_get( + renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict) + if not next_continuation: + return + continuation = next_continuation.get('continuation') + if not continuation: + return + ctp = next_continuation.get('clickTrackingParams') + return { + 'ctoken': continuation, + 'continuation': continuation, + 'itct': ctp, + } + + @classmethod + def _extract_continuation(cls, renderer): + next_continuation = cls._extract_next_continuation_data(renderer) + if next_continuation: + return next_continuation + contents = renderer.get('contents') + if not isinstance(contents, list): + return + for content in contents: + if not isinstance(content, dict): + continue + continuation_ep = try_get( + content, lambda x: x['continuationItemRenderer']['continuationEndpoint'], + dict) + if not continuation_ep: + continue + continuation = try_get( + continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) + if not continuation: + continue + ctp = continuation_ep.get('clickTrackingParams') + if not ctp: + continue + return { + 'ctoken': continuation, + 'continuation': continuation, + 'itct': ctp, + } + + def _entries(self, tab, identity_token): + + def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds + contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: + renderer = content.get('richItemRenderer') + if renderer: + for entry in self._rich_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(parent_renderer) + continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer): + yield entry + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for entry in self._post_thread_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(is_renderer) + + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(parent_renderer) + + continuation_list = [None] # Python 2 doesnot support nonlocal + parent_renderer = ( + try_get(tab, lambda x: x['sectionListRenderer'], dict) + or try_get(tab, lambda x: x['richGridRenderer'], dict) or {}) + for entry in extract_entries(parent_renderer): + yield entry + continuation = continuation_list[0] + + headers = { + 'x-youtube-client-name': '1', + 'x-youtube-client-version': '2.20201112.04.01', + } + if identity_token: + headers['x-youtube-identity-token'] = identity_token + + for page_num in itertools.count(1): + if not continuation: + break + browse = self._download_json( + 'https://www.youtube.com/browse_ajax', None, + 'Downloading page %d' % page_num, + headers=headers, query=continuation, fatal=False) + if not browse: + break + response = try_get(browse, lambda x: x[1]['response'], dict) + if not response: + break + + continuation_contents = try_get( + response, lambda x: x['continuationContents'], dict) + if continuation_contents: + continuation_renderer = continuation_contents.get('playlistVideoListContinuation') + if continuation_renderer: + for entry in self._playlist_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + continuation_renderer = continuation_contents.get('gridContinuation') + if continuation_renderer: + for entry in self._grid_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + continuation_renderer = continuation_contents.get('itemSectionContinuation') + if continuation_renderer: + for entry in self._post_thread_continuation_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + continuation_renderer = continuation_contents.get('sectionListContinuation') # for feeds + if continuation_renderer: + continuation_list = [None] + for entry in extract_entries(continuation_renderer): + yield entry + continuation = continuation_list[0] + continue + + continuation_items = try_get( + response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) + if continuation_items: + continuation_item = continuation_items[0] + if not isinstance(continuation_item, dict): + continue + renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') + if renderer: + video_list_renderer = {'contents': continuation_items} + for entry in self._playlist_entries(video_list_renderer): + yield entry + continuation = self._extract_continuation(video_list_renderer) + continue + break + + @staticmethod + def _extract_selected_tab(tabs): + for tab in tabs: + if try_get(tab, lambda x: x['tabRenderer']['selected'], bool): + return tab['tabRenderer'] + else: + raise ExtractorError('Unable to find selected tab') + + @staticmethod + def _extract_uploader(data): + uploader = {} + sidebar_renderer = try_get( + data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) + if sidebar_renderer: + for item in sidebar_renderer: + if not isinstance(item, dict): + continue + renderer = item.get('playlistSidebarSecondaryInfoRenderer') + if not isinstance(renderer, dict): + continue + owner = try_get( + renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) + if owner: + uploader['uploader'] = owner.get('text') + uploader['uploader_id'] = try_get( + owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) + uploader['uploader_url'] = urljoin( + 'https://www.youtube.com/', + try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) + return uploader + + def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + selected_tab = self._extract_selected_tab(tabs) + renderer = try_get( + data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + playlist_id = title = description = None + if renderer: + channel_title = renderer.get('title') or item_id + tab_title = selected_tab.get('title') + title = channel_title or item_id + if tab_title: + title += ' - %s' % tab_title + description = renderer.get('description') + playlist_id = renderer.get('externalId') + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + if renderer: + title = renderer.get('title') + description = None + playlist_id = item_id + if playlist_id is None: + playlist_id = item_id + if title is None: + title = "Youtube " + playlist_id.title() + playlist = self.playlist_result( + self._entries(selected_tab['content'], identity_token), + playlist_id=playlist_id, playlist_title=title, + playlist_description=description) + playlist.update(self._extract_uploader(data)) + return playlist + + def _extract_from_playlist(self, item_id, data, playlist): + title = playlist.get('title') or try_get( + data, lambda x: x['titleText']['simpleText'], compat_str) + playlist_id = playlist.get('playlistId') or item_id + return self.playlist_result( + self._playlist_entries(playlist), playlist_id=playlist_id, + playlist_title=title) + + def _extract_alerts(self, data): + for alert_dict in try_get(data, lambda x: x['alerts'], list) or []: + for renderer in alert_dict: + alert = alert_dict[renderer] + alert_type = alert.get('type') + if not alert_type: + continue + message = try_get(alert, lambda x: x['text']['simpleText'], compat_str) + if message: + yield alert_type, message + for run in try_get(alert, lambda x: x['text']['runs'], list) or []: + message = try_get(run, lambda x: x['text'], compat_str) + if message: + yield alert_type, message + + def _real_extract(self, url): + item_id = self._match_id(url) + url = compat_urlparse.urlunparse( + compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url) + if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed': + self._downloader.report_warning( + 'A channel/user page was given. All the channel\'s videos will be downloaded. ' + 'To download only the videos in the home page, add a "/home" to the URL') + url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '') + + # Handle both video/playlist URLs + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('v', [None])[0] + playlist_id = qs.get('list', [None])[0] + + if is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id: + if playlist_id: + self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id)) + url = 'https://www.youtube.com/playlist?list=%s' % playlist_id + # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key()) + else: + raise ExtractorError('Unable to recognize tab page') + if video_id and playlist_id: + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + + webpage = self._download_webpage(url, item_id) + identity_token = self._search_regex( + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + 'identity token', default=None) + data = self._extract_yt_initial_data(item_id, webpage) + for alert_type, alert_message in self._extract_alerts(data): + self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message)) + tabs = try_get( + data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + if tabs: + return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + playlist = try_get( + data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) + if playlist: + return self._extract_from_playlist(item_id, data, playlist) + # Fallback to video extraction if no playlist alike page is recognized. + # First check for the current video then try the v attribute of URL query. + video_id = try_get( + data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], + compat_str) or video_id + if video_id: + return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) + # Failed to recognize + raise ExtractorError('Unable to recognize tab page') + + +class YoutubePlaylistIE(InfoExtractor): + IE_DESC = 'YouTube.com playlists' + _VALID_URL = r'''(?x)(?: + (?:https?://)? + (?:\w+\.)? + (?: + (?: + youtube(?:kids)?\.com| + invidio\.us| + youtu\.be + ) + /.*?\?.*?\blist= + )? + (?P<id>%(playlist_id)s) + )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + IE_NAME = 'youtube:playlist' + _TESTS = [{ + 'note': 'issue #673', + 'url': 'PLBB231211A4F62143', + 'info_dict': { + 'title': '[OLD]Team Fortress 2 (Class-based LP)', + 'id': 'PLBB231211A4F62143', + 'uploader': 'Wickydoo', + 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', + }, + 'playlist_mincount': 29, + }, { + 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', + 'info_dict': { + 'title': 'YDL_safe_search', + 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', + }, + 'playlist_count': 2, + 'skip': 'This playlist is private', + }, { + 'note': 'embedded', + 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA15', + 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'uploader': 'milan', + 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', + } + }, { + 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'playlist_mincount': 982, + 'info_dict': { + 'title': '2018 Chinese New Singles (11/6 updated)', + 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'uploader': 'LBK', + 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', + } }, { 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', 'info_dict': { @@ -2706,16 +3369,6 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'noplaylist': True, 'skip_download': True, }, - }, { - # https://github.com/ytdl-org/youtube-dl/issues/21844 - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'Computerphile', - 'uploader': 'Computerphile', - }, - 'playlist_mincount': 11, }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, @@ -2726,431 +3379,57 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # music album playlist 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', 'only_matching': True, - }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', - 'only_matching': True, - }] - - def _real_initialize(self): - self._login() - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - - for item in re.findall( - r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): - attrs = extract_attributes(item) - video_id = attrs['data-video-id'] - video_title = unescapeHTML(attrs.get('data-title')) - if video_title: - video_title = video_title.strip() - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - # Fallback with old _VIDEO_RE - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - - # Relaxed fallbacks - self.extract_videos_from_page_impl( - r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - self.extract_videos_from_page_impl( - r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - - return zip(ids_in_page, titles_in_page) - - def _extract_mix(self, playlist_id): - # The mixes are generated from a single video - # the id of the playlist is just 'RD' + video_id - ids = [] - last_id = playlist_id[-11:] - for n in itertools.count(1): - url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) - new_ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - # Fetch new pages until all the videos are repeated, it seems that - # there are always 51 unique videos. - new_ids = [_id for _id in new_ids if _id not in ids] - if not new_ids: - break - ids.extend(new_ids) - last_id = ids[-1] - - url_results = self._ids_to_results(ids) - - search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) - title_span = ( - search_title('playlist-title') - or search_title('title long-title') - or search_title('title')) - title = clean_html(title_span) - - return self.playlist_result(url_results, playlist_id, title) - - def _extract_playlist(self, playlist_id): - url = self._TEMPLATE_URL % playlist_id - page = self._download_webpage(url, playlist_id) - - # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) - for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): - match = match.strip() - # Check if the playlist exists or is private - mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) - if mobj: - reason = mobj.group('reason') - message = 'This playlist %s' % reason - if 'private' in reason: - message += ', use --username or --netrc to access it' - message += '.' - raise ExtractorError(message, expected=True) - elif re.match(r'[^<]*Invalid parameters[^<]*', match): - raise ExtractorError( - 'Invalid parameters. Maybe URL is incorrect.', - expected=True) - elif re.match(r'[^<]*Choose your language[^<]*', match): - continue - else: - self.report_warning('Youtube gives an alert message: ' + match) - - playlist_title = self._html_search_regex( - r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', - page, 'title', default=None) - - _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' - uploader = self._html_search_regex( - r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, - page, 'uploader', default=None) - mobj = re.search( - r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, - page) - if mobj: - uploader_id = mobj.group('uploader_id') - uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) - else: - uploader_id = uploader_url = None - - has_videos = True - - if not playlist_title: - try: - # Some playlist URLs don't actually serve a playlist (e.g. - # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) - next(self._entries(page, playlist_id)) - except StopIteration: - has_videos = False - - playlist = self.playlist_result( - self._entries(page, playlist_id), playlist_id, playlist_title) - playlist.update({ - 'uploader': uploader, - 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - }) - - return has_videos, playlist - - def _check_download_just_video(self, url, playlist_id): - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = query_dict.get('v', [None])[0] or self._search_regex( - r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, - 'video id', default=None) - if video_id: - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - return video_id, None - return None, None - - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - video_id, video = self._check_download_just_video(url, playlist_id) - if video: - return video - - if playlist_id.startswith(('RD', 'UL', 'PU')): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - - has_videos, playlist = self._extract_playlist(playlist_id) - if has_videos or not video_id: - return playlist - - # Some playlist URLs don't actually serve a playlist (see - # https://github.com/ytdl-org/youtube-dl/issues/10537). - # Fallback to plain video extraction if there is a video id - # along with playlist id. - return self.url_result(video_id, 'Youtube', video_id=video_id) - - -class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' - _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' - IE_NAME = 'youtube:channel' - _TESTS = [{ - 'note': 'paginated channel', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'playlist_mincount': 91, - 'info_dict': { - 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'Uploads from lex will', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - } - }, { - 'note': 'Age restricted channel', - # from https://www.youtube.com/user/DeusExOfficial - 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', - 'playlist_mincount': 64, - 'info_dict': { - 'id': 'UUs0ifCMCm1icqRbqhUINa0w', - 'title': 'Uploads from Deus Ex', - 'uploader': 'Deus Ex', - 'uploader_id': 'DeusExOfficial', - }, - }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', - 'only_matching': True, }] @classmethod def suitable(cls, url): - return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) - else super(YoutubeChannelIE, cls).suitable(url)) - - def _build_template_url(self, url, channel_id): - return self._TEMPLATE_URL % channel_id + return False if YoutubeTabIE.suitable(url) else super( + YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): - channel_id = self._match_id(url) - - url = self._build_template_url(url, channel_id) - - # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) - # Workaround by extracting as a playlist if managed to obtain channel playlist URL - # otherwise fallback on channel by page extraction - channel_page = self._download_webpage( - url + '?view=57', channel_id, - 'Downloading channel page', fatal=False) - if channel_page is False: - channel_playlist_id = False - else: - channel_playlist_id = self._html_search_meta( - 'channelId', channel_page, 'channel id', default=None) - if not channel_playlist_id: - channel_url = self._html_search_meta( - ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), - channel_page, 'channel url', default=None) - if channel_url: - channel_playlist_id = self._search_regex( - r'vnd\.youtube://user/([0-9A-Za-z_-]+)', - channel_url, 'channel id', default=None) - if channel_playlist_id and channel_playlist_id.startswith('UC'): - playlist_id = 'UU' + channel_playlist_id[2:] - return self.url_result( - compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') - - channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') - autogenerated = re.search(r'''(?x) - class="[^"]*?(?: - channel-header-autogenerated-label| - yt-channel-title-autogenerated - )[^"]*"''', channel_page) is not None - - if autogenerated: - # The videos are contained in a single page - # the ajax pages can't be used, they are empty - entries = [ - self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - for video_id, video_title in self.extract_videos_from_page(channel_page)] - return self.playlist_result(entries, channel_id) - - try: - next(self._entries(channel_page, channel_id)) - except StopIteration: - alert_message = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', - channel_page, 'alert', default=None, group='alert') - if alert_message: - raise ExtractorError('Youtube said: %s' % alert_message, expected=True) - - return self.playlist_result(self._entries(channel_page, channel_id), channel_id) + playlist_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if not qs: + qs = {'list': playlist_id} + return self.url_result( + update_url_query('https://www.youtube.com/playlist', qs), + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) -class YoutubeUserIE(YoutubeChannelIE): - IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' - IE_NAME = 'youtube:user' - +class YoutubeYtUserIE(InfoExtractor): + _VALID_URL = r'ytuser:(?P<id>.+)' _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheLinuxFoundation', - 'playlist_mincount': 320, - 'info_dict': { - 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', - 'title': 'Uploads from The Linux Foundation', - 'uploader': 'The Linux Foundation', - 'uploader_id': 'TheLinuxFoundation', - } - }, { - # Only available via https://www.youtube.com/c/12minuteathlete/videos - # but not https://www.youtube.com/user/12minuteathlete/videos - 'url': 'https://www.youtube.com/c/12minuteathlete/videos', - 'playlist_mincount': 249, - 'info_dict': { - 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', - 'title': 'Uploads from 12 Minute Athlete', - 'uploader': '12 Minute Athlete', - 'uploader_id': 'the12minuteathlete', - } - }, { 'url': 'ytuser:phihag', 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/gametrailers', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/gametrailers', - 'only_matching': True, - }, { - # This channel is not available, geo restricted to JP - 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', - 'only_matching': True, }] - @classmethod - def suitable(cls, url): - # Don't return True if the url can be extracted with other youtube - # extractor, the regex would is too permissive and it would match. - other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_yt_ies): - return False - else: - return super(YoutubeUserIE, cls).suitable(url) - - def _build_template_url(self, url, channel_id): - mobj = re.match(self._VALID_URL, url) - return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) + def _real_extract(self, url): + user_id = self._match_id(url) + return self.url_result( + 'https://www.youtube.com/user/%s' % user_id, + ie=YoutubeTabIE.ie_key(), video_id=user_id) -class YoutubeLiveIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' - IE_NAME = 'youtube:live' - +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:favorites' + IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)' + _VALID_URL = r':ytfav(?:ou?rite)?s?' + _LOGIN_REQUIRED = True _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'url': ':ytfav', 'only_matching': True, }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', + 'url': ':ytfavorites', 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - base_url = mobj.group('base_url') - webpage = self._download_webpage(url, channel_id, fatal=False) - if webpage: - page_type = self._og_search_property( - 'type', webpage, 'page type', default='') - video_id = self._html_search_meta( - 'videoId', webpage, 'video id', default=None) - if page_type.startswith('video') and video_id and re.match( - r'^[0-9A-Za-z_-]{11}$', video_id): - return self.url_result(video_id, YoutubeIE.ie_key()) - return self.url_result(base_url) + return self.url_result( + 'https://www.youtube.com/playlist?list=LL', + ie=YoutubeTabIE.ie_key()) -class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:playlists' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'ThirstForScience', - 'title': 'ThirstForScience', - }, - }, { - # with "Load more" button - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 70, - 'info_dict': { - 'id': 'igorkle1', - 'title': 'Игорь Клейнер', - }, - }, { - 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', - 'playlist_mincount': 17, - 'info_dict': { - 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', - 'title': 'Chem Player', - }, - 'skip': 'Blocked', - }, { - 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', - 'only_matching': True, - }] - - -class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' - - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -3190,10 +3469,33 @@ def _entries(self, query, n): list) if not slr_contents: break - isr_contents = try_get( - slr_contents, - lambda x: x[0]['itemSectionRenderer']['contents'], - list) + + isr_contents = [] + continuation_token = None + # Youtube sometimes adds promoted content to searches, + # changing the index location of videos and token. + # So we search through all entries till we find them. + for index, isr in enumerate(slr_contents): + if not isr_contents: + isr_contents = try_get( + slr_contents, + (lambda x: x[index]['itemSectionRenderer']['contents']), + list) + for content in isr_contents: + if content.get('videoRenderer') is not None: + break + else: + isr_contents = [] + + if continuation_token is None: + continuation_token = try_get( + slr_contents, + lambda x: x[index]['continuationItemRenderer']['continuationEndpoint']['continuationCommand'][ + 'token'], + compat_str) + if continuation_token is not None and isr_contents: + break + if not isr_contents: break for content in isr_contents: @@ -3227,13 +3529,9 @@ def _entries(self, query, n): } if total == n: return - token = try_get( - slr_contents, - lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], - compat_str) - if not token: + if not continuation_token: break - data['continuation'] = token + data['continuation'] = continuation_token def _get_n_results(self, query, n): """Get a specified number of results for a query""" @@ -3247,11 +3545,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_PARAMS = 'CAI%3D' -class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): +class YoutubeSearchURLIE(YoutubeSearchIE): IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' + IE_NAME = YoutubeSearchIE.IE_NAME + '_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' + # _MAX_RESULTS = 100 _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -3263,92 +3561,25 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] - def _find_videos_in_json(self, extracted): - videos = [] - - def _real_find(obj): - if obj is None or isinstance(obj, str): - return - - if type(obj) is list: - for elem in obj: - _real_find(elem) - - if type(obj) is dict: - if "videoId" in obj: - videos.append(obj) - return - - for _, o in obj.items(): - _real_find(o) - - _real_find(extracted) - - return videos - - def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): - search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) - - result_items = self._find_videos_in_json(search_response) - - for renderer in result_items: - video_id = try_get(renderer, lambda x: x['videoId']) - video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText']) - - if video_id is None or video_title is None: - # we do not have a videoRenderer or title extraction broke - continue - - video_title = video_title.strip() - - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) + @classmethod + def _make_valid_url(cls): + return cls._VALID_URL def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query = (qs.get('search_query') or qs.get('q'))[0] + self._SEARCH_PARAMS = qs.get('sp', ('',))[0] + return self._get_n_results(query, self._MAX_RESULTS) -class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' - IE_NAME = 'youtube:show' - _TESTS = [{ - 'url': 'https://www.youtube.com/show/airdisasters', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'airdisasters', - 'title': 'Air Disasters', - } - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - return super(YoutubeShowIE, self)._real_extract( - 'https://www.youtube.com/show/%s/playlists' % playlist_id) - - -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubeTabIE): """ Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. + Subclasses must define the _FEED_NAME property. """ _LOGIN_REQUIRED = True - _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' - _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" + # _MAX_PAGES = 5 + _TESTS = [] @property def IE_NAME(self): @@ -3357,150 +3588,63 @@ def IE_NAME(self): def _real_initialize(self): self._login() - def _find_videos_in_json(self, extracted): - videos = [] - c = {} - - def _real_find(obj): - if obj is None or isinstance(obj, str): - return - - if type(obj) is list: - for elem in obj: - _real_find(elem) - - if type(obj) is dict: - if "videoId" in obj: - videos.append(obj) - return - - if "nextContinuationData" in obj: - c["continuation"] = obj["nextContinuationData"] - return - - for _, o in obj.items(): - _real_find(o) - - _real_find(extracted) - - return videos, try_get(c, lambda x: x["continuation"]) - - def _entries(self, page): - info = [] - - yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False) - - search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) - - for page_num in itertools.count(1): - video_info, continuation = self._find_videos_in_json(search_response) - - new_info = [] - - for v in video_info: - v_id = try_get(v, lambda x: x['videoId']) - if not v_id: - continue - - have_video = False - for old in info: - if old['videoId'] == v_id: - have_video = True - break - - if not have_video: - new_info.append(v) - - if not new_info: - break - - info.extend(new_info) - - for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText'])) - - if not continuation or not yt_conf: - break - - search_response = self._download_json( - 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - query={ - "ctoken": try_get(continuation, lambda x: x["continuation"]), - "continuation": try_get(continuation, lambda x: x["continuation"]), - "itct": try_get(continuation, lambda x: x["clickTrackingParams"]) - }, - headers={ - "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]), - "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]), - "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]), - "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), - "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), - "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), - "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), - }) - def _real_extract(self, url): - page = self._download_webpage( + return self.url_result( 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) + ie=YoutubeTabIE.ie_key()) -class YoutubeWatchLaterIE(YoutubePlaylistIE): +class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' - + _VALID_URL = r':ytwatchlater' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=WL', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', + 'url': ':ytwatchlater', 'only_matching': True, }] def _real_extract(self, url): - _, video = self._check_download_just_video(url, 'WL') - if video: - return video - _, playlist = self._extract_playlist('WL') - return playlist - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' - _LOGIN_REQUIRED = True - - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') - return self.url_result(playlist_id, 'YoutubePlaylist') + return self.url_result( + 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' + _TESTS = [{ + 'url': ':ytrec', + 'only_matching': True, + }, { + 'url': ':ytrecommended', + 'only_matching': True, + }, { + 'url': 'https://youtube.com', + 'only_matching': True, + }] class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)' + _VALID_URL = r':ytsub(?:scription)?s?' _FEED_NAME = 'subscriptions' - _PLAYLIST_TITLE = 'Youtube Subscriptions' + _TESTS = [{ + 'url': ':ytsubs', + 'only_matching': True, + }, { + 'url': ':ytsubscriptions', + 'only_matching': True, + }] class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' + _VALID_URL = r':ythistory' _FEED_NAME = 'history' - _PLAYLIST_TITLE = 'Youtube History' + _TESTS = [{ + 'url': ':ythistory', + 'only_matching': True, + }] class YoutubeTruncatedURLIE(InfoExtractor): @@ -3567,3 +3711,25 @@ def _real_extract(self, url): raise ExtractorError( 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), expected=True) + + +# Do Youtube show urls even exist anymore? I couldn't find any +r''' +class YoutubeShowIE(YoutubeTabIE): + IE_DESC = 'YouTube.com (multi-season) shows' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' + IE_NAME = 'youtube:show' + _TESTS = [{ + 'url': 'https://www.youtube.com/show/airdisasters', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'airdisasters', + 'title': 'Air Disasters', + } + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + return super(YoutubeShowIE, self)._real_extract( + 'https://www.youtube.com/show/%s/playlists' % playlist_id) +''' diff --git a/youtube_dlc/extractor/zoom.py b/youtube_dlc/extractor/zoom.py new file mode 100644 index 000000000..038a90297 --- /dev/null +++ b/youtube_dlc/extractor/zoom.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + url_or_none, + parse_filesize, + urlencode_postdata +) + + +class ZoomIE(InfoExtractor): + IE_NAME = 'zoom' + _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/(play|share)/(?P<id>[A-Za-z0-9\-_.]+)' + + _TEST = { + 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', + 'info_dict': { + 'md5': '031a5b379f1547a8b29c5c4c837dccf2', + 'title': "GAZ Transformational Tuesdays W/ Landon & Stapes", + 'id': "SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK", + 'ext': "mp4" + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + password_protected = self._search_regex(r'<form[^>]+?id="(password_form)"', webpage, 'password field', fatal=False, default=None) + if password_protected is not None: + self._verify_video_password(url, display_id, webpage) + webpage = self._download_webpage(url, display_id) + + video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url') + title = self._html_search_regex([r"topic: \"(.*)\",", r"<title>(.*) - Zoom"], webpage, 'title') + viewResolvtionsWidth = self._search_regex(r"viewResolvtionsWidth: (\d*)", webpage, 'res width', fatal=False) + viewResolvtionsHeight = self._search_regex(r"viewResolvtionsHeight: (\d*)", webpage, 'res height', fatal=False) + fileSize = parse_filesize(self._search_regex(r"fileSize: \'(.+)\'", webpage, 'fileSize', fatal=False)) + + urlprefix = url.split("zoom.us")[0] + "zoom.us/" + + formats = [] + formats.append({ + 'url': url_or_none(video_url), + 'width': int_or_none(viewResolvtionsWidth), + 'height': int_or_none(viewResolvtionsHeight), + 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', + 'Referer': urlprefix}, + 'ext': "mp4", + 'filesize_approx': int_or_none(fileSize) + }) + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': title, + 'formats': formats + } + + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('videopassword') + if password is None: + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + meetId = self._search_regex(r']+?id="meetId" value="([^\"]+)"', webpage, 'meetId') + data = urlencode_postdata({ + 'id': meetId, + 'passwd': password, + 'action': "viewdetailedpage", + 'recaptcha': "" + }) + validation_url = url.split("zoom.us")[0] + "zoom.us/rec/validate_meet_passwd" + validation_response = self._download_json( + validation_url, video_id, + note='Validating Password...', + errnote='Wrong password?', + data=data) + + if validation_response['errorCode'] != 0: + raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, validation_response['errorMessage'])) diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py index 1d7a7fed2..9ad8a6ddd 100644 --- a/youtube_dlc/options.py +++ b/youtube_dlc/options.py @@ -344,6 +344,10 @@ def _comma_separated_values_options_callback(option, opt_str, value, parser): '--download-archive', metavar='FILE', dest='download_archive', help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.') + selection.add_option( + '--break-on-existing', + action='store_true', dest='break_on_existing', default=False, + help="Stop the download process after attempting to download a file that's in the archive.") selection.add_option( '--include-ads', dest='include_ads', action='store_true', @@ -582,7 +586,7 @@ def _comma_separated_values_options_callback(option, opt_str, value, parser): 'along with --min-sleep-interval.')) workarounds.add_option( '--sleep-subtitles', - dest='sleep_interval_subtitles', action='store_true', default=False, + dest='sleep_interval_subtitles', default=0, type=int, help='Enforce sleep interval on subtitles as well') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') diff --git a/youtube_dlc/postprocessor/embedthumbnail.py b/youtube_dlc/postprocessor/embedthumbnail.py index 4a0d02fc4..e9f2161a0 100644 --- a/youtube_dlc/postprocessor/embedthumbnail.py +++ b/youtube_dlc/postprocessor/embedthumbnail.py @@ -89,12 +89,15 @@ def is_webp(path): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) elif info['ext'] == 'mkv': - os.rename(encodeFilename(thumbnail_filename), encodeFilename('cover.jpg')) old_thumbnail_filename = thumbnail_filename - thumbnail_filename = 'cover.jpg' + thumbnail_filename = os.path.join(os.path.dirname(old_thumbnail_filename), 'cover.jpg') + if os.path.exists(thumbnail_filename): + os.remove(encodeFilename(thumbnail_filename)) + os.rename(encodeFilename(old_thumbnail_filename), encodeFilename(thumbnail_filename)) options = [ - '-c', 'copy', '-attach', thumbnail_filename, '-metadata:s:t', 'mimetype=image/jpeg'] + '-c', 'copy', '-map', '0', + '-attach', thumbnail_filename, '-metadata:s:t', 'mimetype=image/jpeg'] self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename) @@ -140,6 +143,6 @@ def is_webp(path): os.remove(encodeFilename(filename)) os.rename(encodeFilename(temp_filename), encodeFilename(filename)) else: - raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.') + raise EmbedThumbnailPPError('Only mp3, mkv, m4a and mp4 are supported for thumbnail embedding for now.') return [], info diff --git a/youtube_dlc/postprocessor/ffmpeg.py b/youtube_dlc/postprocessor/ffmpeg.py index 5e85f4eeb..c7071d73d 100644 --- a/youtube_dlc/postprocessor/ffmpeg.py +++ b/youtube_dlc/postprocessor/ffmpeg.py @@ -359,7 +359,7 @@ def run(self, information): if information['ext'] == self._preferedformat: self._downloader.to_screen('[ffmpeg] Not remuxing video file %s - already is in target format %s' % (path, self._preferedformat)) return [], information - options = ['-c', 'copy'] + options = ['-c', 'copy', '-map', '0'] prefix, sep, ext = path.rpartition('.') outpath = prefix + sep + self._preferedformat self._downloader.to_screen('[' + 'ffmpeg' + '] Remuxing video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath) @@ -412,7 +412,9 @@ def run(self, information): for lang, sub_info in subtitles.items(): sub_ext = sub_info['ext'] - if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + if sub_ext == 'json': + self._downloader.to_screen('[ffmpeg] JSON subtitles cannot be embedded') + elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': sub_langs.append(lang) sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext)) else: @@ -426,8 +428,7 @@ def run(self, information): input_files = [filename] + sub_filenames opts = [ - '-map', '0', - '-c', 'copy', + '-c', 'copy', '-map', '0', # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', @@ -577,7 +578,7 @@ def run(self, info): filename = info['filepath'] temp_filename = prepend_extension(filename, 'temp') - options = ['-c', 'copy', '-aspect', '%f' % stretched_ratio] + options = ['-c', 'copy', '-map', '0', '-aspect', '%f' % stretched_ratio] self._downloader.to_screen('[ffmpeg] Fixing aspect ratio in "%s"' % filename) self.run_ffmpeg(filename, temp_filename, options) @@ -595,7 +596,7 @@ def run(self, info): filename = info['filepath'] temp_filename = prepend_extension(filename, 'temp') - options = ['-c', 'copy', '-f', 'mp4'] + options = ['-c', 'copy', '-map', '0', '-f', 'mp4'] self._downloader.to_screen('[ffmpeg] Correcting container in "%s"' % filename) self.run_ffmpeg(filename, temp_filename, options) @@ -611,7 +612,7 @@ def run(self, info): if self.get_audio_codec(filename) == 'aac': temp_filename = prepend_extension(filename, 'temp') - options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + options = ['-c', 'copy', '-map', '0', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename) self.run_ffmpeg(filename, temp_filename, options) @@ -643,13 +644,18 @@ def run(self, info): self._downloader.to_screen( '[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext) continue + elif ext == 'json': + self._downloader.to_screen( + '[ffmpeg] You have requested to convert json subtitles into another format, ' + 'which is currently not possible') + continue old_file = subtitles_filename(filename, lang, ext, info.get('ext')) sub_filenames.append(old_file) new_file = subtitles_filename(filename, lang, new_ext, info.get('ext')) if ext in ('dfxp', 'ttml', 'tt'): self._downloader.report_warning( - 'You have requested to convert dfxp (TTML) subtitles into another format, ' + '[ffmpeg] You have requested to convert dfxp (TTML) subtitles into another format, ' 'which results in style information loss') dfxp_file = old_file diff --git a/youtube_dlc/update.py b/youtube_dlc/update.py index e49e09c17..b358e902b 100644 --- a/youtube_dlc/update.py +++ b/youtube_dlc/update.py @@ -37,10 +37,26 @@ def update_self(to_screen, verbose, opener): JSON_URL = UPDATE_URL + 'versions.json' UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) + def sha256sum(): + h = hashlib.sha256() + b = bytearray(128 * 1024) + mv = memoryview(b) + with open(os.path.realpath(sys.executable), 'rb', buffering=0) as f: + for n in iter(lambda: f.readinto(mv), 0): + h.update(mv[:n]) + return h.hexdigest() + + to_screen('Current Build Hash %s' % sha256sum()) + if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, 'frozen'): to_screen('It looks like you installed youtube-dlc with a package manager, pip, setup.py or a tarball. Please use that to update.') return + # compiled file.exe can find itself by + # to_screen(os.path.basename(sys.executable)) + # and path to py or exe + # to_screen(os.path.realpath(sys.executable)) + # Check if there is a new version try: newversion = opener.open(VERSION_URL).read().decode('utf-8').strip() @@ -48,6 +64,7 @@ def update_self(to_screen, verbose, opener): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t find the current version. Please try again later.') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return if newversion == __version__: to_screen('youtube-dlc is up-to-date (' + __version__ + ')') @@ -61,6 +78,7 @@ def update_self(to_screen, verbose, opener): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t obtain versions info. Please try again later.') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return if 'signature' not in versions_info: to_screen('ERROR: the versions file is not signed or corrupted. Aborting.') @@ -109,6 +127,7 @@ def version_tuple(version_str): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return newcontent_hash = hashlib.sha256(newcontent).hexdigest() @@ -155,6 +174,7 @@ def version_tuple(version_str): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return newcontent_hash = hashlib.sha256(newcontent).hexdigest() diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 54a4ea2aa..68b4ca944 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -2320,8 +2320,8 @@ def bug_reports_message(): if ytdl_is_updateable(): update_cmd = 'type youtube-dlc -U to update' else: - update_cmd = 'see https://yt-dl.org/update on how to update' - msg = '; please report this issue on https://yt-dl.org/bug .' + update_cmd = 'see https://github.com/blackjack4494/yt-dlc on how to update' + msg = '; please report this issue on https://github.com/blackjack4494/yt-dlc .' msg += ' Make sure you are using the latest version; %s.' % update_cmd msg += ' Be sure to call youtube-dlc with the --verbose flag and include its complete output.' return msg @@ -2460,7 +2460,7 @@ def __init__(self, code=None, msg='Unknown error'): # Parsing code and msg if (self.code in (errno.ENOSPC, errno.EDQUOT) - or 'No space left' in self.msg or 'Disk quota excedded' in self.msg): + or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg): self.reason = 'NO_SPACE' elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: self.reason = 'VALUE_TOO_LONG' @@ -4085,7 +4085,7 @@ def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v.startswith('//') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': return "" if v[0] in ("'", '"'): @@ -4095,12 +4095,12 @@ def fix_kv(m): '\\\n': '', '\\x': '\\u00', }.get(m.group(0), m.group(0)), v[1:-1]) - - for regex, base in INTEGER_TABLE: - im = re.match(regex, v) - if im: - i = int(im.group(1), base) - return '"%d":' % i if v.endswith(':') else '%d' % i + else: + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(1), base) + return '"%d":' % i if v.endswith(':') else '%d' % i return '"%s"' % v @@ -4110,7 +4110,8 @@ def fix_kv(m): {comment}|,(?={skip}[\]}}])| (?:(?