diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index f2260db465..3a94bd621f 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.07.28 + [debug] youtube-dl version 2019.11.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 8bc05c4ba7..72bee12aa2 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 98348e0cd6..ddf67e9518 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 86706f5289..7122e2714d 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.07.28 + [debug] youtube-dl version 2019.11.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 52c2709f94..a93882b39d 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/.travis.yml b/.travis.yml index 51afd469af..14d95fa84c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ dist: trusty env: - YTDL_TEST_SET=core - YTDL_TEST_SET=download -jobs: +matrix: include: - python: 3.7 dist: xenial @@ -35,11 +35,6 @@ jobs: env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - env: JYTHON=true; YTDL_TEST_SET=download - - name: flake8 - python: 3.8 - dist: xenial - install: pip install flake8 - script: flake8 . fast_finish: true allow_failures: - env: YTDL_TEST_SET=download diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 58ab3a4b89..ac759ddc4e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -153,7 +153,7 @@ After you have ensured this site is distributing its content legally, you can fo 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 youtube_dl/extractor/yourextractor.py diff --git a/ChangeLog b/ChangeLog index bf515f784b..d2f17ee067 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,316 +1,10 @@ -version 2020.07.28 - -Extractors -* [youtube] Fix sigfunc name extraction (#26134, #26135, #26136, #26137) -* [youtube] Improve description extraction (#25937, #25980) -* [wistia] Restrict embed regular expression (#25969) -* [youtube] Prevent excess HTTP 301 (#25786) -+ [youtube:playlists] Extend URL regular expression (#25810) -+ [bellmedia] Add support for cp24.com clip URLs (#25764) -* [brightcove] Improve embed detection (#25674) - - -version 2020.06.16.1 - -Extractors -* [youtube] Force old layout (#25682, #25683, #25680, #25686) -* [youtube] Fix categories and improve tags extraction - - -version 2020.06.16 - -Extractors -* [youtube] Fix uploader id and uploader URL extraction -* [youtube] Improve view count extraction -* [youtube] Fix upload date extraction (#25677) -* [youtube] Fix thumbnails extraction (#25676) -* [youtube] Fix playlist and feed extraction (#25675) -+ [facebook] Add support for single-video ID links -+ [youtube] Extract chapters from JSON (#24819) -+ [kaltura] Add support for multiple embeds on a webpage (#25523) - - -version 2020.06.06 - -Extractors -* [tele5] Bypass geo restriction -+ [jwplatform] Add support for bypass geo restriction -* [tele5] Prefer jwplatform over nexx (#25533) -* [twitch:stream] Expect 400 and 410 HTTP errors from API -* [twitch:stream] Fix extraction (#25528) -* [twitch] Fix thumbnails extraction (#25531) -+ [twitch] Pass v5 Accept HTTP header (#25531) -* [brightcove] Fix subtitles extraction (#25540) -+ [malltv] Add support for sk.mall.tv (#25445) -* [periscope] Fix untitled broadcasts (#25482) -* [jwplatform] Improve embeds extraction (#25467) - - -version 2020.05.29 - -Core -* [postprocessor/ffmpeg] Embed series metadata with --add-metadata -* [utils] Fix file permissions in write_json_file (#12471, #25122) - -Extractors -* [ard:beta] Extend URL regular expression (#25405) -+ [youtube] Add support for more invidious instances (#25417) -* [giantbomb] Extend URL regular expression (#25222) -* [ard] Improve URL regular expression (#25134, #25198) -* [redtube] Improve formats extraction and extract m3u8 formats (#25311, - #25321) -* [indavideo] Switch to HTTPS for API request (#25191) -* [redtube] Improve title extraction (#25208) -* [vimeo] Improve format extraction and sorting (#25285) -* [soundcloud] Reduce API playlist page limit (#25274) -+ [youtube] Add support for yewtu.be (#25226) -* [mailru] Fix extraction (#24530, #25239) -* [bellator] Fix mgid extraction (#25195) - - -version 2020.05.08 - -Core -* [downloader/http] Request last data block of exact remaining size -* [downloader/http] Finish downloading once received data length matches - expected -* [extractor/common] Use compat_cookiejar_Cookie for _set_cookie to always - ensure cookie name and value are bytestrings on python 2 (#23256, #24776) -+ [compat] Introduce compat_cookiejar_Cookie -* [utils] Improve cookie files support - + Add support for UTF-8 in cookie files - * Skip malformed cookie file entries instead of crashing (invalid entry - length, invalid expires at) - -Extractors -* [youtube] Improve signature cipher extraction (#25187, #25188) -* [iprima] Improve extraction (#25138) -* [uol] Fix extraction (#22007) -+ [orf] Add support for more radio stations (#24938, #24968) -* [dailymotion] Fix typo -- [puhutv] Remove no longer available HTTP formats (#25124) - - -version 2020.05.03 - -Core -+ [extractor/common] Extract multiple JSON-LD entries -* [options] Clarify doc on --exec command (#19087, #24883) -* [extractor/common] Skip malformed ISM manifest XMLs while extracting - ISM formats (#24667) - -Extractors -* [crunchyroll] Fix and improve extraction (#25096, #25060) -* [youtube] Improve player id extraction -* [youtube] Use redirected video id if any (#25063) -* [yahoo] Fix GYAO Player extraction and relax URL regular expression - (#24178, #24778) -* [tvplay] Fix Viafree extraction (#15189, #24473, #24789) -* [tenplay] Relax URL regular expression (#25001) -+ [prosiebensat1] Extract series metadata -* [prosiebensat1] Improve extraction and remove 7tv.de support (#24948) -- [prosiebensat1] Remove 7tv.de support (#24948) -* [youtube] Fix DRM videos detection (#24736) -* [thisoldhouse] Fix video id extraction (#24548, #24549) -+ [soundcloud] Extract AAC format (#19173, #24708) -* [youtube] Skip broken multifeed videos (#24711) -* [nova:embed] Fix extraction (#24700) -* [motherless] Fix extraction (#24699) -* [twitch:clips] Extend URL regular expression (#24290, #24642) -* [tv4] Fix ISM formats extraction (#24667) -* [tele5] Fix extraction (#24553) -+ [mofosex] Add support for generic embeds (#24633) -+ [youporn] Add support for generic embeds -+ [spankwire] Add support for generic embeds (#24633) -* [spankwire] Fix extraction (#18924, #20648) - - -version 2020.03.24 - -Core -- [utils] Revert support for cookie files with spaces used instead of tabs - -Extractors -* [teachable] Update upskillcourses and gns3 domains -* [generic] Look for teachable embeds before wistia -+ [teachable] Extract chapter metadata (#24421) -+ [bilibili] Add support for player.bilibili.com (#24402) -+ [bilibili] Add support for new URL schema with BV ids (#24439, #24442) -* [limelight] Remove disabled API requests (#24255) -* [soundcloud] Fix download URL extraction (#24394) -+ [cbc:watch] Add support for authentication (#19160) -* [hellporno] Fix extraction (#24399) -* [xtube] Fix formats extraction (#24348) -* [ndr] Fix extraction (#24326) -* [nhk] Update m3u8 URL and use native HLS downloader (#24329) -- [nhk] Remove obsolete rtmp formats (#24329) -* [nhk] Relax URL regular expression (#24329) -- [vimeo] Revert fix showcase password protected video extraction (#24224) - - -version 2020.03.08 - -Core -+ [utils] Add support for cookie files with spaces used instead of tabs - -Extractors -+ [pornhub] Add support for pornhubpremium.com (#24288) -- [youtube] Remove outdated code and unnecessary requests -* [youtube] Improve extraction in 429 HTTP error conditions (#24283) -* [nhk] Update API version (#24270) - - -version 2020.03.06 - -Extractors -* [youtube] Fix age-gated videos support without login (#24248) -* [vimeo] Fix showcase password protected video extraction (#24224) -* [pornhub] Improve title extraction (#24184) -* [peertube] Improve extraction (#23657) -+ [servus] Add support for new URL schema (#23475, #23583, #24142) -* [vimeo] Fix subtitles URLs (#24209) - - -version 2020.03.01 - -Core -* [YoutubeDL] Force redirect URL to unicode on python 2 -- [options] Remove duplicate short option -v for --version (#24162) - -Extractors -* [xhamster] Fix extraction (#24205) -* [franceculture] Fix extraction (#24204) -+ [telecinco] Add support for article opening videos -* [telecinco] Fix extraction (#24195) -* [xtube] Fix metadata extraction (#21073, #22455) -* [youjizz] Fix extraction (#24181) -- Remove no longer needed compat_str around geturl -* [pornhd] Fix extraction (#24128) -+ [teachable] Add support for multiple videos per lecture (#24101) -+ [wistia] Add support for multiple generic embeds (#8347, 11385) -* [imdb] Fix extraction (#23443) -* [tv2dk:bornholm:play] Fix extraction (#24076) - - -version 2020.02.16 - -Core -* [YoutubeDL] Fix playlist entry indexing with --playlist-items (#10591, - #10622) -* [update] Fix updating via symlinks (#23991) -+ [compat] Introduce compat_realpath (#23991) - -Extractors -+ [npr] Add support for streams (#24042) -+ [24video] Add support for porn.24video.net (#23779, #23784) -- [jpopsuki] Remove extractor (#23858) -* [nova] Improve extraction (#23690) -* [nova:embed] Improve (#23690) -* [nova:embed] Fix extraction (#23672) -+ [abc:iview] Add support for 720p (#22907, #22921) -* [nytimes] Improve format sorting (#24010) -+ [toggle] Add support for mewatch.sg (#23895, #23930) -* [thisoldhouse] Fix extraction (#23951) -+ [popcorntimes] Add support for popcorntimes.tv (#23949) -* [sportdeutschland] Update to new API -* [twitch:stream] Lowercase channel id for stream request (#23917) -* [tv5mondeplus] Fix extraction (#23907, #23911) -* [tva] Relax URL regular expression (#23903) -* [vimeo] Fix album extraction (#23864) -* [viewlift] Improve extraction - * Fix extraction (#23851) - + Add support for authentication - + Add support for more domains -* [svt] Fix series extraction (#22297) -* [svt] Fix article extraction (#22897, #22919) -* [soundcloud] Imporve private playlist/set tracks extraction (#3707) - - -version 2020.01.24 - -Extractors -* [youtube] Fix sigfunc name extraction (#23819) -* [stretchinternet] Fix extraction (#4319) -* [voicerepublic] Fix extraction -* [azmedien] Fix extraction (#23783) -* [businessinsider] Fix jwplatform id extraction (#22929, #22954) -+ [24video] Add support for 24video.vip (#23753) -* [ivi:compilation] Fix entries extraction (#23770) -* [ard] Improve extraction (#23761) - * Simplify extraction - + Extract age limit and series - * Bypass geo-restriction -+ [nbc] Add support for nbc multi network URLs (#23049) -* [americastestkitchen] Fix extraction -* [zype] Improve extraction - + Extract subtitles (#21258) - + Support URLs with alternative keys/tokens (#21258) - + Extract more metadata -* [orf:tvthek] Improve geo restricted videos detection (#23741) -* [soundcloud] Restore previews extraction (#23739) - - -version 2020.01.15 - -Extractors -* [yourporn] Fix extraction (#21645, #22255, #23459) -+ [canvas] Add support for new API endpoint (#17680, #18629) -* [ndr:base:embed] Improve thumbnails extraction (#23731) -+ [vodplatform] Add support for embed.kwikmotion.com domain -+ [twitter] Add support for promo_video_website cards (#23711) -* [orf:radio] Clean description and improve extraction -* [orf:fm4] Fix extraction (#23599) -* [safari] Fix kaltura session extraction (#23679, #23670) -* [lego] Fix extraction and extract subtitle (#23687) -* [cloudflarestream] Improve extraction - + Add support for bytehighway.net domain - + Add support for signed URLs - + Extract thumbnail -* [naver] Improve extraction - * Improve geo-restriction handling - + Extract automatic captions - + Extract uploader metadata - + Extract VLive HLS formats - * Improve metadata extraction -- [pandatv] Remove extractor (#23630) -* [dctp] Fix format extraction (#23656) -+ [scrippsnetworks] Add support for www.discovery.com videos -* [discovery] Fix anonymous token extraction (#23650) -* [nrktv:seriebase] Fix extraction (#23625, #23537) -* [wistia] Improve format extraction and extract subtitles (#22590) -* [vice] Improve extraction (#23631) -* [redtube] Detect private videos (#23518) - - -version 2020.01.01 - -Extractors -* [brightcove] Invalidate policy key cache on failing requests -* [pornhub] Improve locked videos detection (#22449, #22780) -+ [pornhub] Add support for m3u8 formats -* [pornhub] Fix extraction (#22749, #23082) -* [brightcove] Update policy key on failing requests -* [spankbang] Improve removed video detection (#23423) -* [spankbang] Fix extraction (#23307, #23423, #23444) -* [soundcloud] Automatically update client id on failing requests -* [prosiebensat1] Improve geo restriction handling (#23571) -* [brightcove] Cache brightcove player policy keys -* [teachable] Fail with error message if no video URL found -* [teachable] Improve locked lessons detection (#23528) -+ [scrippsnetworks] Add support for Scripps Networks sites (#19857, #22981) -* [mitele] Fix extraction (#21354, #23456) -* [soundcloud] Update client id (#23516) -* [mailru] Relax URL regular expressions (#23509) - - -version 2019.12.25 +version Core * [utils] Improve str_to_int + [downloader/hls] Add ability to override AES decryption key URL (#17521) Extractors -* [mediaset] Fix parse formats (#23508) + [tv2dk:bornholm:play] Add support for play.tv2bornholm.dk (#23291) + [slideslive] Add support for url and vimeo service names (#23414) * [slideslive] Fix extraction (#23413) diff --git a/README.md b/README.md index 45326c69ec..01f975958c 100644 --- a/README.md +++ b/README.md @@ -434,9 +434,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo either the path to the binary or its containing directory. --exec CMD Execute a command on the file after - downloading and post-processing, similar to - find's -exec syntax. Example: --exec 'adb - push {} /sdcard/Music/ && rm {}' + downloading, similar to find's -exec + syntax. Example: --exec 'adb push {} + /sdcard/Music/ && rm {}' --convert-subs FORMAT Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc) @@ -835,9 +835,7 @@ In February 2015, the new YouTube player contained a character sequence in a str ### HTTP Error 429: Too Many Requests or 402: Payment Required -These two error codes indicate that the service is blocking your IP address because of overuse. Usually this is a soft block meaning that you can gain access again after solving CAPTCHA. Just open a browser and solve a CAPTCHA the service suggests you and after that [pass cookies](#how-do-i-pass-cookies-to-youtube-dl) to youtube-dl. Note that if your machine has multiple external IPs then you should also pass exactly the same IP you've used for solving CAPTCHA with [`--source-address`](#network-options). Also you may need to pass a `User-Agent` HTTP header of your browser with [`--user-agent`](#workarounds). - -If this is not the case (no CAPTCHA suggested to solve by the service) then you can contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. +These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. ### SyntaxError: Non-ASCII character @@ -1032,7 +1030,7 @@ After you have ensured this site is distributing its content legally, you can fo 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 youtube_dl/extractor/yourextractor.py diff --git a/devscripts/create-github-release.py b/devscripts/create-github-release.py index 2ddfa10969..428111b3f0 100644 --- a/devscripts/create-github-release.py +++ b/devscripts/create-github-release.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from __future__ import unicode_literals +import base64 import io import json import mimetypes @@ -14,6 +15,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.compat import ( compat_basestring, + compat_input, compat_getpass, compat_print, compat_urllib_request, @@ -38,20 +40,28 @@ class GitHubReleaser(object): try: info = netrc.netrc().authenticators(self._NETRC_MACHINE) if info is not None: - self._token = info[2] + self._username = info[0] + self._password = info[2] compat_print('Using GitHub credentials found in .netrc...') return else: compat_print('No GitHub credentials found in .netrc') except (IOError, netrc.NetrcParseError): compat_print('Unable to parse .netrc') - self._token = compat_getpass( - 'Type your GitHub PAT (personal access token) and press [Return]: ') + self._username = compat_input( + 'Type your GitHub username or email address and press [Return]: ') + self._password = compat_getpass( + 'Type your GitHub password and press [Return]: ') def _call(self, req): if isinstance(req, compat_basestring): req = sanitized_Request(req) - req.add_header('Authorization', 'token %s' % self._token) + # Authorizing manually since GitHub does not response with 401 with + # WWW-Authenticate header set (see + # https://developer.github.com/v3/#basic-authentication) + b64 = base64.b64encode( + ('%s:%s' % (self._username, self._password)).encode('utf-8')).decode('ascii') + req.add_header('Authorization', 'Basic %s' % b64) response = self._opener.open(req).read().decode('utf-8') return json.loads(response) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 35c1050e54..2744dfca84 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -28,11 +28,10 @@ - **acast:channel** - **ADN**: Anime Digital Network - **AdobeConnect** - - **adobetv** - - **adobetv:channel** - - **adobetv:embed** - - **adobetv:show** - - **adobetv:video** + - **AdobeTV** + - **AdobeTVChannel** + - **AdobeTVShow** + - **AdobeTVVideo** - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault - **afreecatv**: afreecatv.com @@ -98,7 +97,6 @@ - **BiliBili** - **BilibiliAudio** - **BilibiliAudioAlbum** - - **BiliBiliPlayer** - **BioBioChileTV** - **BIQLE** - **BitChute** @@ -390,6 +388,7 @@ - **JeuxVideo** - **Joj** - **Jove** + - **jpopsuki.tv** - **JWPlatform** - **Kakao** - **Kaltura** @@ -397,7 +396,6 @@ - **Kankan** - **Karaoketv** - **KarriereVideos** - - **Katsomo** - **KeezMovies** - **Ketnet** - **KhanAcademy** @@ -405,6 +403,7 @@ - **KinjaEmbed** - **KinoPoisk** - **KonserthusetPlay** + - **kontrtube**: KontrTube.ru - Труба зовёт - **KrasView**: Красвью - **Ku6** - **KUSI** @@ -497,7 +496,6 @@ - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - - **MofosexEmbed** - **Mojvideo** - **Morningstar**: morningstar.com - **Motherless** @@ -515,6 +513,7 @@ - **mtvjapan** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv + - **MusicPlayOn** - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses - **Mwave** @@ -620,25 +619,16 @@ - **Ooyala** - **OoyalaExternal** - **OraTV** - - **orf:burgenland**: Radio Burgenland - **orf:fm4**: radio FM4 - **orf:fm4:story**: fm4.orf.at stories - **orf:iptv**: iptv.ORF.at - - **orf:kaernten**: Radio Kärnten - - **orf:noe**: Radio Niederösterreich - - **orf:oberoesterreich**: Radio Oberösterreich - **orf:oe1**: Radio Österreich 1 - - **orf:oe3**: Radio Österreich 3 - - **orf:salzburg**: Radio Salzburg - - **orf:steiermark**: Radio Steiermark - - **orf:tirol**: Radio Tirol - **orf:tvthek**: ORF TVthek - - **orf:vorarlberg**: Radio Vorarlberg - - **orf:wien**: Radio Wien - **OsnatelTV** - **OutsideTV** - **PacktPub** - **PacktPubCourse** + - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos @@ -674,7 +664,6 @@ - **Pokemon** - **PolskieRadio** - **PolskieRadioCategory** - - **Popcorntimes** - **PopcornTV** - **PornCom** - **PornerBros** @@ -772,7 +761,6 @@ - **screen.yahoo:search**: Yahoo screen search - **Screencast** - **ScreencastOMatic** - - **ScrippsNetworks** - **scrippsnetworks:watch** - **SCTE** - **SCTECourse** @@ -925,7 +913,6 @@ - **tv2.hu** - **TV2Article** - **TV2DK** - - **TV2DKBornholmPlay** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **TVA** @@ -967,7 +954,6 @@ - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 - - **UFCArabia** - **UFCTV** - **UKTVPlay** - **umg:de**: Universal Music Deutschland @@ -1007,6 +993,7 @@ - **videomore** - **videomore:season** - **videomore:video** + - **VideoPremium** - **VideoPress** - **Vidio** - **VidLii** @@ -1016,8 +1003,8 @@ - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - - **viewlift** - - **viewlift:embed** + - **ViewLift** + - **ViewLiftEmbed** - **Viidea** - **viki** - **viki:channel** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1e204e551b..ce96661716 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -816,15 +816,11 @@ class TestYoutubeDL(unittest.TestCase): 'webpage_url': 'http://example.com', } - def get_downloaded_info_dicts(params): - ydl = YDL(params) - # make a deep copy because the dictionary and nested entries - # can be modified - ydl.process_ie_result(copy.deepcopy(playlist)) - return ydl.downloaded_info_dicts - def get_ids(params): - return [int(v['id']) for v in get_downloaded_info_dicts(params)] + ydl = YDL(params) + # make a copy because the dictionary can be modified + ydl.process_ie_result(playlist.copy()) + return [int(v['id']) for v in ydl.downloaded_info_dicts] result = get_ids({}) self.assertEqual(result, [1, 2, 3, 4]) @@ -856,22 +852,6 @@ class TestYoutubeDL(unittest.TestCase): result = get_ids({'playlist_items': '2-4,3-4,3'}) self.assertEqual(result, [2, 3, 4]) - # Tests for https://github.com/ytdl-org/youtube-dl/issues/10591 - # @{ - result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) - self.assertEqual(result[0]['playlist_index'], 2) - self.assertEqual(result[1]['playlist_index'], 3) - - result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) - self.assertEqual(result[0]['playlist_index'], 2) - self.assertEqual(result[1]['playlist_index'], 3) - self.assertEqual(result[2]['playlist_index'], 4) - - result = get_downloaded_info_dicts({'playlist_items': '4,2'}) - self.assertEqual(result[0]['playlist_index'], 4) - self.assertEqual(result[1]['playlist_index'], 2) - # @} - def test_urlopen_no_file_protocol(self): # see https://github.com/ytdl-org/youtube-dl/issues/8227 ydl = YDL() diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index 05f48bd741..f959798deb 100644 --- a/test/test_YoutubeDLCookieJar.py +++ b/test/test_YoutubeDLCookieJar.py @@ -39,13 +39,6 @@ class TestYoutubeDLCookieJar(unittest.TestCase): assert_cookie_has_value('HTTPONLY_COOKIE') assert_cookie_has_value('JS_ACCESSIBLE_COOKIE') - def test_malformed_cookies(self): - cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/malformed_cookies.txt') - cookiejar.load(ignore_discard=True, ignore_expires=True) - # Cookies should be empty since all malformed cookie file entries - # will be ignored - self.assertFalse(cookiejar._cookies) - if __name__ == '__main__': unittest.main() diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 17aaaf20d9..7d57a628e5 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -26,6 +26,7 @@ from youtube_dl.extractor import ( ThePlatformIE, ThePlatformFeedIE, RTVEALaCartaIE, + FunnyOrDieIE, DemocracynowIE, ) @@ -321,6 +322,18 @@ class TestRtveSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') +class TestFunnyOrDieSubtitles(BaseTestSubtitles): + url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine' + IE = FunnyOrDieIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') + + class TestDemocracynowSubtitles(BaseTestSubtitles): url = 'http://www.democracynow.org/shows/2015/7/3' IE = DemocracynowIE diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py index e69c57377e..324ca85257 100644 --- a/test/test_youtube_chapters.py +++ b/test/test_youtube_chapters.py @@ -267,7 +267,7 @@ class TestYoutubeChapters(unittest.TestCase): for description, duration, expected_chapters in self._TEST_CASES: ie = YoutubeIE() expect_value( - self, ie._extract_chapters_from_description(description, duration), + self, ie._extract_chapters(description, duration), expected_chapters, None) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 69df30edaa..f0c370eeed 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -74,28 +74,6 @@ _TESTS = [ ] -class TestPlayerInfo(unittest.TestCase): - def test_youtube_extract_player_info(self): - PLAYER_URLS = ( - ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), - # obsolete - ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), - ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), - ('https://www.youtube.com/yts/jsbin/player_ias-vflCPQUIL/en_US/base.js', 'vflCPQUIL'), - ('https://www.youtube.com/yts/jsbin/player-vflzQZbt7/en_US/base.js', 'vflzQZbt7'), - ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), - ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), - ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), - ('http://s.ytimg.com/yt/swfbin/watch_as3-vflrEm9Nq.swf', 'vflrEm9Nq'), - ('https://s.ytimg.com/yts/swfbin/player-vflenCdZL/watch_as3.swf', 'vflenCdZL'), - ) - for player_url, expected_player_id in PLAYER_URLS: - expected_player_type = player_url.split('.')[-1] - player_type, player_id = YoutubeIE._extract_player_info(player_url) - self.assertEqual(player_type, expected_player_type) - self.assertEqual(player_id, expected_player_id) - - class TestSignature(unittest.TestCase): def setUp(self): TEST_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/test/testdata/cookies/malformed_cookies.txt b/test/testdata/cookies/malformed_cookies.txt deleted file mode 100644 index 17bc403547..0000000000 --- a/test/testdata/cookies/malformed_cookies.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Netscape HTTP Cookie File -# http://curl.haxx.se/rfc/cookie_spec.html -# This is a generated file! Do not edit. - -# Cookie file entry with invalid number of fields - 6 instead of 7 -www.foobar.foobar FALSE / FALSE 0 COOKIE - -# Cookie file entry with invalid expires at -www.foobar.foobar FALSE / FALSE 1.7976931348623157e+308 COOKIE VALUE diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 19370f62b0..f5cb463081 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -92,7 +92,6 @@ from .utils import ( YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, - YoutubeDLRedirectHandler, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -991,7 +990,7 @@ class YoutubeDL(object): 'playlist_title': ie_result.get('title'), 'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, + 'playlist_index': i + playliststart, 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], 'webpage_url_basename': url_basename(ie_result['webpage_url']), @@ -2344,7 +2343,6 @@ class YoutubeDL(object): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) - redirect_handler = YoutubeDLRedirectHandler() data_handler = compat_urllib_request_DataHandler() # When passing our own FileHandler instance, build_opener won't add the @@ -2358,7 +2356,7 @@ class YoutubeDL(object): file_handler.file_open = file_open opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) + proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0ee9bc7602..c75ab131b9 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -57,17 +57,6 @@ try: except ImportError: # Python 2 import cookielib as compat_cookiejar -if sys.version_info[0] == 2: - class compat_cookiejar_Cookie(compat_cookiejar.Cookie): - def __init__(self, version, name, value, *args, **kwargs): - if isinstance(name, compat_str): - name = name.encode() - if isinstance(value, compat_str): - value = value.encode() - compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs) -else: - compat_cookiejar_Cookie = compat_cookiejar.Cookie - try: import http.cookies as compat_cookies except ImportError: # Python 2 @@ -2765,17 +2754,6 @@ else: compat_expanduser = os.path.expanduser -if compat_os_name == 'nt' and sys.version_info < (3, 8): - # os.path.realpath on Windows does not follow symbolic links - # prior to Python 3.8 (see https://bugs.python.org/issue9949) - def compat_realpath(path): - while os.path.islink(path): - path = os.path.abspath(os.readlink(path)) - return path -else: - compat_realpath = os.path.realpath - - if sys.version_info < (3, 0): def compat_print(s): from .utils import preferredencoding @@ -2998,7 +2976,6 @@ __all__ = [ 'compat_basestring', 'compat_chr', 'compat_cookiejar', - 'compat_cookiejar_Cookie', 'compat_cookies', 'compat_ctypes_WINFUNCTYPE', 'compat_etree_Element', @@ -3021,7 +2998,6 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', - 'compat_realpath', 'compat_setenv', 'compat_shlex_quote', 'compat_shlex_split', diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 5046878dfc..3c72ea18b2 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -227,7 +227,7 @@ class HttpFD(FileDownloader): while True: try: # Download and write - data_block = ctx.data.read(block_size if data_len is None else min(block_size, data_len - byte_counter)) + data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) # socket.timeout is a subclass of socket.error but may not have # errno set except socket.timeout as e: @@ -299,7 +299,7 @@ class HttpFD(FileDownloader): 'elapsed': now - ctx.start_time, }) - if data_len is not None and byte_counter == data_len: + if is_test and byte_counter == data_len: break if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 6637f4f353..4ac323bf6d 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -110,17 +110,17 @@ class ABCIViewIE(InfoExtractor): # ABC iview programs are normally available for 14 days only. _TESTS = [{ - 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', - 'md5': '67715ce3c78426b11ba167d875ac6abf', + 'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00', + 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'LE1927H001S00', + 'id': 'ZX9371A050S00', 'ext': 'mp4', - 'title': "Series 11 Ep 1", - 'series': "Gruen", - 'description': 'md5:52cc744ad35045baf6aded2ce7287f67', - 'upload_date': '20190925', - 'uploader_id': 'abc1', - 'timestamp': 1569445289, + 'title': "Gaston's Birthday", + 'series': "Ben And Holly's Little Kingdom", + 'description': 'md5:f9de914d02f226968f598ac76f105bcf', + 'upload_date': '20180604', + 'uploader_id': 'abc4kids', + 'timestamp': 1528140219, }, 'params': { 'skip_download': True, @@ -148,7 +148,7 @@ class ABCIViewIE(InfoExtractor): 'hdnea': token, }) - for sd in ('720', 'sd', 'sd-low'): + for sd in ('sd', 'sd-low'): sd_url = try_get( stream, lambda x: x['streams']['hls'][sd], compat_str) if not sd_url: diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 9c9d77ae10..8b32aa886e 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..utils import ( clean_html, int_or_none, - js_to_json, try_get, unified_strdate, ) @@ -14,21 +13,22 @@ from ..utils import ( class AmericasTestKitchenIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' _TESTS = [{ - 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', + 'url': 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { - 'id': '5b400b9ee338f922cb06450c', - 'title': 'Weeknight Japanese Suppers', + 'id': '1_5g5zua6e', + 'title': 'Summer Dinner Party', 'ext': 'mp4', - 'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8', - 'thumbnail': r're:^https?://', - 'timestamp': 1523664000, - 'upload_date': '20180414', - 'release_date': '20180414', + 'description': 'md5:858d986e73a4826979b6a5d9f8f6a1ec', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1497285541, + 'upload_date': '20170612', + 'uploader_id': 'roger.metcalf@americastestkitchen.com', + 'release_date': '20170617', 'series': "America's Test Kitchen", - 'season_number': 18, - 'episode': 'Weeknight Japanese Suppers', - 'episode_number': 15, + 'season_number': 17, + 'episode': 'Summer Dinner Party', + 'episode_number': 24, }, 'params': { 'skip_download': True, @@ -47,7 +47,7 @@ class AmericasTestKitchenIE(InfoExtractor): self._search_regex( r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', webpage, 'initial context'), - video_id, js_to_json) + video_id) ep_data = try_get( video_data, @@ -55,7 +55,17 @@ class AmericasTestKitchenIE(InfoExtractor): lambda x: x['videoDetail']['content']['data']), dict) ep_meta = ep_data.get('full_video', {}) - zype_id = ep_data.get('zype_id') or ep_meta['zype_id'] + zype_id = ep_meta.get('zype_id') + if zype_id: + embed_url = 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id + ie_key = 'Zype' + else: + partner_id = self._search_regex( + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') + external_id = ep_data.get('external_id') or ep_meta['external_id'] + embed_url = 'kaltura:%s:%s' % (partner_id, external_id) + ie_key = 'Kaltura' title = ep_data.get('title') or ep_meta.get('title') description = clean_html(ep_meta.get('episode_description') or ep_data.get( @@ -69,8 +79,8 @@ class AmericasTestKitchenIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id, - 'ie_key': 'Zype', + 'url': embed_url, + 'ie_key': ie_key, 'title': title, 'description': description, 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 5b7b2dd6d2..8adae46449 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor @@ -23,101 +22,7 @@ from ..utils import ( from ..compat import compat_etree_fromstring -class ARDMediathekBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['DE'] - - def _extract_media_info(self, media_info_url, webpage, video_id): - media_info = self._download_json( - media_info_url, video_id, 'Downloading media JSON') - return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) - - def _parse_media_info(self, media_info, video_id, fsk): - formats = self._extract_formats(media_info, video_id) - - if not formats: - if fsk: - raise ExtractorError( - 'This video is only available after 20:00', expected=True) - elif media_info.get('_geoblocked'): - self.raise_geo_restricted( - 'This video is not available due to geoblocking', - countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - subtitles = {} - subtitle_url = media_info.get('_subtitleUrl') - if subtitle_url: - subtitles['de'] = [{ - 'ext': 'ttml', - 'url': subtitle_url, - }] - - return { - 'id': video_id, - 'duration': int_or_none(media_info.get('_duration')), - 'thumbnail': media_info.get('_previewImage'), - 'is_live': media_info.get('_isLive') is True, - 'formats': formats, - 'subtitles': subtitles, - } - - def _extract_formats(self, media_info, video_id): - type_ = media_info.get('_type') - media_array = media_info.get('_mediaArray', []) - formats = [] - for num, media in enumerate(media_array): - for stream in media.get('_mediaStreamArray', []): - stream_urls = stream.get('_stream') - if not stream_urls: - continue - if not isinstance(stream_urls, list): - stream_urls = [stream_urls] - quality = stream.get('_quality') - server = stream.get('_server') - for stream_url in stream_urls: - if not url_or_none(stream_url): - continue - ext = determine_ext(stream_url) - if quality != 'auto' and ext in ('f4m', 'm3u8'): - continue - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(stream_url, { - 'hdcore': '3.1.1', - 'plugin': 'aasp-3.1.1.69.124' - }), video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - if server and server.startswith('rtmp'): - f = { - 'url': server, - 'play_path': stream_url, - 'format_id': 'a%s-rtmp-%s' % (num, quality), - } - else: - f = { - 'url': stream_url, - 'format_id': 'a%s-%s-%s' % (num, ext, quality) - } - m = re.search( - r'_(?P\d+)x(?P\d+)\.mp4$', - stream_url) - if m: - f.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - if type_ == 'audio': - f['vcodec'] = 'none' - formats.append(f) - return formats - - -class ARDMediathekIE(ARDMediathekBaseIE): +class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' @@ -158,6 +63,94 @@ class ARDMediathekIE(ARDMediathekBaseIE): def suitable(cls, url): return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) + def _extract_media_info(self, media_info_url, webpage, video_id): + media_info = self._download_json( + media_info_url, video_id, 'Downloading media JSON') + + formats = self._extract_formats(media_info, video_id) + + if not formats: + if '"fsk"' in webpage: + raise ExtractorError( + 'This video is only available after 20:00', expected=True) + elif media_info.get('_geoblocked'): + raise ExtractorError('This video is not available due to geo restriction', expected=True) + + self._sort_formats(formats) + + duration = int_or_none(media_info.get('_duration')) + thumbnail = media_info.get('_previewImage') + is_live = media_info.get('_isLive') is True + + subtitles = {} + subtitle_url = media_info.get('_subtitleUrl') + if subtitle_url: + subtitles['de'] = [{ + 'ext': 'ttml', + 'url': subtitle_url, + }] + + return { + 'id': video_id, + 'duration': duration, + 'thumbnail': thumbnail, + 'is_live': is_live, + 'formats': formats, + 'subtitles': subtitles, + } + + def _extract_formats(self, media_info, video_id): + type_ = media_info.get('_type') + media_array = media_info.get('_mediaArray', []) + formats = [] + for num, media in enumerate(media_array): + for stream in media.get('_mediaStreamArray', []): + stream_urls = stream.get('_stream') + if not stream_urls: + continue + if not isinstance(stream_urls, list): + stream_urls = [stream_urls] + quality = stream.get('_quality') + server = stream.get('_server') + for stream_url in stream_urls: + if not url_or_none(stream_url): + continue + ext = determine_ext(stream_url) + if quality != 'auto' and ext in ('f4m', 'm3u8'): + continue + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(stream_url, { + 'hdcore': '3.1.1', + 'plugin': 'aasp-3.1.1.69.124' + }), + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + if server and server.startswith('rtmp'): + f = { + 'url': server, + 'play_path': stream_url, + 'format_id': 'a%s-rtmp-%s' % (num, quality), + } + else: + f = { + 'url': stream_url, + 'format_id': 'a%s-%s-%s' % (num, ext, quality) + } + m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + if type_ == 'audio': + f['vcodec'] = 'none' + formats.append(f) + return formats + def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) @@ -249,7 +242,7 @@ class ARDMediathekIE(ARDMediathekBaseIE): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' _TESTS = [{ # available till 14.02.2019 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', @@ -263,9 +256,6 @@ class ARDIE(InfoExtractor): 'upload_date': '20180214', 'thumbnail': r're:^https?://.*\.jpg$', }, - }, { - 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html', - 'only_matching': True, }, { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'only_matching': True, @@ -312,31 +302,21 @@ class ARDIE(InfoExtractor): } -class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' +class ARDBetaMediathekIE(InfoExtractor): + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' _TESTS = [{ - 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', + 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', + 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': '70153354', - 'title': 'Die robuste Roswita', + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', + 'title': 'Tatort: Die robuste Roswita', 'description': r're:^Der Mord.*trüber ist als die Ilm.', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', - 'timestamp': 1577047500, - 'upload_date': '20191222', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', + 'upload_date': '20180826', 'ext': 'mp4', }, - }, { - 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'only_matching': True, - }, { - 'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/', - 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/', - 'only_matching': True, }, { 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', 'only_matching': True, @@ -348,75 +328,73 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id + display_id = mobj.group('display_id') or video_id - player_page = self._download_json( - 'https://api.ardmediathek.de/public-gateway', - display_id, data=json.dumps({ - 'query': '''{ - playerPage(client:"%s", clipId: "%s") { - blockedByFsk - broadcastedOn - maturityContentRating - mediaCollection { - _duration - _geoblocked - _isLive - _mediaArray { - _mediaStreamArray { - _quality - _server - _stream - } - } - _previewImage - _subtitleUrl - _type - } - show { - title - } - synopsis - title - tracking { - atiCustomVars { - contentId - } - } - } -}''' % (mobj.group('client'), video_id), - }).encode(), headers={ - 'Content-Type': 'application/json' - })['data']['playerPage'] - title = player_page['title'] - content_id = str_or_none(try_get( - player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) - media_collection = player_page.get('mediaCollection') or {} - if not media_collection and content_id: - media_collection = self._download_json( - 'https://www.ardmediathek.de/play/media/' + content_id, - content_id, fatal=False) or {} - info = self._parse_media_info( - media_collection, content_id or video_id, - player_page.get('blockedByFsk')) - age_limit = None - description = player_page.get('synopsis') - maturity_content_rating = player_page.get('maturityContentRating') - if maturity_content_rating: - age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) - if not age_limit and description: - age_limit = int_or_none(self._search_regex( - r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) - info.update({ - 'age_limit': age_limit, + webpage = self._download_webpage(url, display_id) + data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') + data = self._parse_json(data_json, display_id) + + res = { + 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': description, - 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), - 'series': try_get(player_page, lambda x: x['show']['title']), + } + formats = [] + subtitles = {} + geoblocked = False + for widget in data.values(): + if widget.get('_geoblocked') is True: + geoblocked = True + if '_duration' in widget: + res['duration'] = int_or_none(widget['_duration']) + if 'clipTitle' in widget: + res['title'] = widget['clipTitle'] + if '_previewImage' in widget: + res['thumbnail'] = widget['_previewImage'] + if 'broadcastedOn' in widget: + res['timestamp'] = unified_timestamp(widget['broadcastedOn']) + if 'synopsis' in widget: + res['description'] = widget['synopsis'] + subtitle_url = url_or_none(widget.get('_subtitleUrl')) + if subtitle_url: + subtitles.setdefault('de', []).append({ + 'ext': 'ttml', + 'url': subtitle_url, + }) + if '_quality' in widget: + format_url = url_or_none(try_get( + widget, lambda x: x['_stream']['json'][0])) + if not format_url: + continue + ext = determine_ext(format_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=3.11.0', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', + fatal=False)) + else: + # HTTP formats are not available when geoblocked is True, + # other formats are fine though + if geoblocked: + continue + quality = str_or_none(widget.get('_quality')) + formats.append({ + 'format_id': ('http-' + quality) if quality else 'http', + 'url': format_url, + 'preference': 10, # Plain HTTP, that's nice + }) + + if not formats and geoblocked: + self.raise_geo_restricted( + msg='This video is not available due to geoblocking', + countries=['DE']) + + self._sort_formats(formats) + res.update({ + 'subtitles': subtitles, + 'formats': formats, }) - return info + + return res diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index b1e20def53..fcbdc71b98 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,19 +47,39 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _PARTNER_ID = '1719221' def _real_extract(self, url): - host, display_id, article_id, entry_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + video_id = mobj.group('id') + entry_id = mobj.group('kaltura_id') if not entry_id: - entry_id = self._download_json( - self._API_TEMPL % (host, host.split('.')[0]), display_id, query={ - 'variables': json.dumps({ - 'contextId': 'NewsArticle:' + article_id, - }), - })['data']['context']['mainAsset']['video']['kaltura']['kalturaId'] + api_url = 'https://www.%s/api/pub/gql/%s' % (host, host.split('.')[0]) + payload = { + 'query': '''query VideoContext($articleId: ID!) { + article: node(id: $articleId) { + ... on Article { + mainAssetRelation { + asset { + ... on VideoAsset { + kalturaId + } + } + } + } + } + }''', + 'variables': {'articleId': 'Article:%s' % mobj.group('article_id')}, + } + json_data = self._download_json( + api_url, video_id, headers={ + 'Content-Type': 'application/json', + }, + data=json.dumps(payload).encode()) + entry_id = json_data['data']['article']['mainAssetRelation']['asset']['kalturaId'] return self.url_result( 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 002c39c394..901c5a54fb 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -528,7 +528,7 @@ class BBCCoUkIE(InfoExtractor): def get_programme_id(item): def get_from_attributes(item): - for p in ('identifier', 'group'): + for p in('identifier', 'group'): value = item.get(p) if value and re.match(r'^[pb][\da-z]{7}$', value): return value diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 9f9de96c61..485173774d 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -25,8 +25,8 @@ class BellMediaIE(InfoExtractor): etalk| marilyn )\.ca| - (?:much|cp24)\.com - )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' + much\.com + )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' _TESTS = [{ 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', 'md5': '36d3ef559cfe8af8efe15922cd3ce950', @@ -62,9 +62,6 @@ class BellMediaIE(InfoExtractor): }, { 'url': 'http://www.etalk.ca/video?videoid=663455', 'only_matching': True, - }, { - 'url': 'https://www.cp24.com/video?clipId=1982548', - 'only_matching': True, }] _DOMAINS = { 'thecomedynetwork': 'comedy', diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 4dc597e160..80bd696e21 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -24,18 +24,7 @@ from ..utils import ( class BiliBiliIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:(?:www|bangumi)\.)? - bilibili\.(?:tv|com)/ - (?: - (?: - video/[aA][vV]| - anime/(?P\d+)/play\# - )(?P\d+)| - video/[bB][vV](?P[^/?#&]+) - ) - ''' + _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P\d+)/play#)(?P\d+)' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', @@ -103,10 +92,6 @@ class BiliBiliIE(InfoExtractor): 'skip_download': True, # Test metadata only }, }] - }, { - # new BV video id format - 'url': 'https://www.bilibili.com/video/BV1JE411F741', - 'only_matching': True, }] _APP_KEY = 'iVGUTjsxvpLeuDCf' @@ -124,7 +109,7 @@ class BiliBiliIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') or mobj.group('id_bv') + video_id = mobj.group('id') anime_id = mobj.group('anime_id') webpage = self._download_webpage(url, video_id) @@ -434,17 +419,3 @@ class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): entries, am_id, album_title, album_data.get('intro')) return self.playlist_result(entries, am_id) - - -class BiliBiliPlayerIE(InfoExtractor): - _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P\d+)' - _TEST = { - 'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1', - 'only_matching': True, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result( - 'http://www.bilibili.tv/video/av%s/' % video_id, - ie=BiliBiliIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 2aa9f4782e..8e2f7217ab 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -5,34 +5,32 @@ import base64 import re import struct -from .adobepass import AdobePassIE from .common import InfoExtractor +from .adobepass import AdobePassIE from ..compat import ( compat_etree_fromstring, - compat_HTTPError, compat_parse_qs, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, + compat_HTTPError, ) from ..utils import ( - clean_html, - extract_attributes, ExtractorError, + extract_attributes, find_xpath_attr, fix_xml_ampersands, float_or_none, - int_or_none, js_to_json, - mimetype2ext, + int_or_none, parse_iso8601, smuggle_url, - str_or_none, unescapeHTML, unsmuggle_url, - UnsupportedError, update_url_query, - url_or_none, + clean_html, + mimetype2ext, + UnsupportedError, ) @@ -426,7 +424,7 @@ class BrightcoveNewIE(AdobePassIE): # [2] looks like: for video, script_tag, account_id, player_id, embed in re.findall( r'''(?isx) - (]*\bdata-video-id\s*=\s*['"]?[^>]+>) + (]*\bdata-video-id\s*=\s*['"]?[^>]+>) (?:.*? (]+ src=["\'](?:https?:)?//players\.brightcove\.net/ @@ -555,16 +553,10 @@ class BrightcoveNewIE(AdobePassIE): subtitles = {} for text_track in json_data.get('text_tracks', []): - if text_track.get('kind') != 'captions': - continue - text_track_url = url_or_none(text_track.get('src')) - if not text_track_url: - continue - lang = (str_or_none(text_track.get('srclang')) - or str_or_none(text_track.get('label')) or 'en').lower() - subtitles.setdefault(lang, []).append({ - 'url': text_track_url, - }) + if text_track.get('src'): + subtitles.setdefault(text_track.get('srclang'), []).append({ + 'url': text_track['src'], + }) is_live = False duration = float_or_none(json_data.get('duration'), 1000) @@ -594,63 +586,45 @@ class BrightcoveNewIE(AdobePassIE): account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups() - policy_key_id = '%s_%s' % (account_id, player_id) - policy_key = self._downloader.cache.load('brightcove', policy_key_id) - policy_key_extracted = False - store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) - def extract_policy_key(): - webpage = self._download_webpage( - 'http://players.brightcove.net/%s/%s_%s/index.min.js' - % (account_id, player_id, embed), video_id) + policy_key = None - policy_key = None - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') + policy_key = catalog.get('policyKey') - if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P.+?)\1', - webpage, 'policy key', group='pk') - - store_pk(policy_key) - return policy_key + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) - headers = {} + headers = { + 'Accept': 'application/json;pk=%s' % policy_key, + } referrer = smuggled_data.get('referrer') if referrer: headers.update({ 'Referer': referrer, 'Origin': re.search(r'https?://[^/]+', referrer).group(0), }) - - for _ in range(2): - if not policy_key: - policy_key = extract_policy_key() - policy_key_extracted = True - headers['Accept'] = 'application/json;pk=%s' % policy_key - try: - json_data = self._download_json(api_url, video_id, headers=headers) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): - json_data = self._parse_json(e.cause.read().decode(), video_id)[0] - message = json_data.get('message') or json_data['error_code'] - if json_data.get('error_subcode') == 'CLIENT_GEO': - self.raise_geo_restricted(msg=message) - elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted: - policy_key = None - store_pk(None) - continue - raise ExtractorError(message, expected=True) - raise + try: + json_data = self._download_json(api_url, video_id, headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message, expected=True) + raise errors = json_data.get('errors') if errors and errors[0].get('error_subcode') == 'TVE_AUTH': diff --git a/youtube_dl/extractor/businessinsider.py b/youtube_dl/extractor/businessinsider.py index 73a57b1e4d..dfcf9bc6b5 100644 --- a/youtube_dl/extractor/businessinsider.py +++ b/youtube_dl/extractor/businessinsider.py @@ -9,26 +9,21 @@ class BusinessInsiderIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6', - 'md5': 'ffed3e1e12a6f950aa2f7d83851b497a', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', 'info_dict': { - 'id': 'cjGDb0X9', + 'id': 'hZRllCfw', 'ext': 'mp4', - 'title': "Bananas give you more radiation exposure than living next to a nuclear power plant", - 'description': 'md5:0175a3baf200dd8fa658f94cade841b3', - 'upload_date': '20160611', - 'timestamp': 1465675620, + 'title': "Here's how much radiation you're exposed to in everyday life", + 'description': 'md5:9a0d6e2c279948aadaa5e84d6d9b99bd', + 'upload_date': '20170709', + 'timestamp': 1499606400, + }, + 'params': { + 'skip_download': True, }, }, { 'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/', - 'md5': '43f438dbc6da0b89f5ac42f68529d84a', - 'info_dict': { - 'id': '5zJwd4FK', - 'ext': 'mp4', - 'title': 'Deze dingen zorgen ervoor dat je minder snel een date scoort', - 'description': 'md5:2af8975825d38a4fed24717bbe51db49', - 'upload_date': '20170705', - 'timestamp': 1499270528, - }, + 'only_matching': True, }, { 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', 'only_matching': True, @@ -40,8 +35,7 @@ class BusinessInsiderIE(InfoExtractor): jwplatform_id = self._search_regex( (r'data-media-id=["\']([a-zA-Z0-9]{8})', r'id=["\']jwplayer_([a-zA-Z0-9]{8})', - r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})', - r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})'), + r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})'), webpage, 'jwplatform id') return self.url_result( 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8667a0d045..c506bc5dd2 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -13,8 +13,6 @@ from ..utils import ( int_or_none, merge_dicts, parse_iso8601, - str_or_none, - url_or_none, ) @@ -22,15 +20,15 @@ class CanvasIE(InfoExtractor): _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'md5': '68993eda72ef62386a15ea2cf3c93107', + 'md5': '90139b746a0a9bd7bb631283f6e2a64e', 'info_dict': { 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Nachtwacht: De Greystook', - 'description': 'Nachtwacht: De Greystook', + 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.04, + 'duration': 1468.03, }, 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], }, { @@ -41,45 +39,23 @@ class CanvasIE(InfoExtractor): 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8', } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site_id'), mobj.group('id') - # Old API endpoint, serves more formats but may fail for some videos data = self._download_json( 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) - - # New API endpoint - if not data: - token = self._download_json( - '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', - headers={'Content-Type': 'application/json'})['vrtPlayerToken'] - data = self._download_json( - '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', fatal=False, query={ - 'vrtPlayerToken': token, - 'client': '%s@PROD' % site_id, - }, expected_status=400) - message = data.get('message') - if message and not data.get('title'): - if data.get('code') == 'AUTHENTICATION_REQUIRED': - self.raise_login_required(message) - raise ExtractorError(message, expected=True) + % (site_id, video_id), video_id) title = data['title'] description = data.get('description') formats = [] for target in data['targetUrls']: - format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) + format_url, format_type = target.get('url'), target.get('type') if not format_url or not format_type: continue - format_type = format_type.upper() if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], @@ -158,20 +134,20 @@ class CanvasEenIE(InfoExtractor): }, 'skip': 'Pagina niet gevonden', }, { - 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', + 'url': 'https://www.een.be/sorry-voor-alles/herbekijk-sorry-voor-alles', 'info_dict': { - 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', - 'display_id': 'emma-pakt-thilly-aan', + 'id': 'mz-ast-11a587f8-b921-4266-82e2-0bce3e80d07f', + 'display_id': 'herbekijk-sorry-voor-alles', 'ext': 'mp4', - 'title': 'Emma pakt Thilly aan', - 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', + 'title': 'Herbekijk Sorry voor alles', + 'description': 'md5:8bb2805df8164e5eb95d6a7a29dc0dd3', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 118.24, + 'duration': 3788.06, }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['is not a supported codec'], + 'skip': 'Episode no longer available', }, { 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', 'only_matching': True, @@ -207,44 +183,19 @@ class VrtNUIE(GigyaBaseIE): IE_DESC = 'VrtNU.be' _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?Pvrtnu)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ - # Available via old API endpoint 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', 'info_dict': { 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'De zwarte weduwe', - 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', + 'description': 'md5:d90c21dced7db869a85db89a623998d4', 'duration': 1457.04, 'thumbnail': r're:^https?://.*\.jpg$', - 'season': 'Season 1', + 'season': '1', 'season_number': 1, 'episode_number': 1, }, - 'skip': 'This video is only available for registered users', - 'params': { - 'username': '', - 'password': '', - }, - 'expected_warnings': ['is not a supported codec'], - }, { - # Only available via new API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', - 'info_dict': { - 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', - 'ext': 'mp4', - 'title': 'Aflevering 5', - 'description': 'Wie valt door de mand tijdens een missie?', - 'duration': 2967.06, - 'season': 'Season 1', - 'season_number': 1, - 'episode_number': 5, - }, - 'skip': 'This video is only available for registered users', - 'params': { - 'username': '', - 'password': '', - }, - 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], + 'skip': 'This video is only available for registered users' }] _NETRC_MACHINE = 'vrtnu' _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index fd5ec6033b..751a3a8f26 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -1,10 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals -import hashlib import json import re -from xml.sax.saxutils import escape from .common import InfoExtractor from ..compat import ( @@ -218,29 +216,6 @@ class CBCWatchBaseIE(InfoExtractor): 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', } _GEO_COUNTRIES = ['CA'] - _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login' - _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token' - _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' - _NETRC_MACHINE = 'cbcwatch' - - def _signature(self, email, password): - data = json.dumps({ - 'email': email, - 'password': password, - }).encode() - headers = {'content-type': 'application/json'} - query = {'apikey': self._API_KEY} - resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query) - access_token = resp['access_token'] - - # token - query = { - 'access_token': access_token, - 'apikey': self._API_KEY, - 'jwtapp': 'jwt', - } - resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query) - return resp['signature'] def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path @@ -264,8 +239,7 @@ class CBCWatchBaseIE(InfoExtractor): def _real_initialize(self): if self._valid_device_token(): return - device = self._downloader.cache.load( - 'cbcwatch', self._cache_device_key()) or {} + device = self._downloader.cache.load('cbcwatch', 'device') or {} self._device_id, self._device_token = device.get('id'), device.get('token') if self._valid_device_token(): return @@ -274,30 +248,16 @@ class CBCWatchBaseIE(InfoExtractor): def _valid_device_token(self): return self._device_id and self._device_token - def _cache_device_key(self): - email, _ = self._get_login_info() - return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device' - def _register_device(self): + self._device_id = self._device_token = None result = self._download_xml( self._API_BASE_URL + 'device/register', None, 'Acquiring device token', data=b'web') self._device_id = xpath_text(result, 'deviceId', fatal=True) - email, password = self._get_login_info() - if email and password: - signature = self._signature(email, password) - data = '{0}{1}web'.format( - escape(signature), escape(self._device_id)).encode() - url = self._API_BASE_URL + 'device/login' - result = self._download_xml( - url, None, data=data, - headers={'content-type': 'application/xml'}) - self._device_token = xpath_text(result, 'token', fatal=True) - else: - self._device_token = xpath_text(result, 'deviceToken', fatal=True) + self._device_token = xpath_text(result, 'deviceToken', fatal=True) self._downloader.cache.store( - 'cbcwatch', self._cache_device_key(), { + 'cbcwatch', 'device', { 'id': self._device_id, 'token': self._device_token, }) diff --git a/youtube_dl/extractor/cloudflarestream.py b/youtube_dl/extractor/cloudflarestream.py index 2fdcfbb3af..8ff2c65315 100644 --- a/youtube_dl/extractor/cloudflarestream.py +++ b/youtube_dl/extractor/cloudflarestream.py @@ -1,24 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import re from .common import InfoExtractor class CloudflareStreamIE(InfoExtractor): - _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' - _EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE - _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+' _VALID_URL = r'''(?x) https?:// (?: - (?:watch\.)?%s/| - %s + (?:watch\.)?(?:cloudflarestream\.com|videodelivery\.net)/| + embed\.(?:cloudflarestream\.com|videodelivery\.net)/embed/[^/]+\.js\?.*?\bvideo= ) - (?P%s) - ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) + (?P[\da-f]+) + ''' _TESTS = [{ 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', 'info_dict': { @@ -45,28 +41,23 @@ class CloudflareStreamIE(InfoExtractor): return [ mobj.group('url') for mobj in re.finditer( - r']+\bsrc=(["\'])(?P(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE), + r']+\bsrc=(["\'])(?P(?:https?:)?//embed\.(?:cloudflarestream\.com|videodelivery\.net)/embed/[^/]+\.js\?.*?\bvideo=[\da-f]+?.*?)\1', webpage)] def _real_extract(self, url): video_id = self._match_id(url) - domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' - base_url = 'https://%s/%s/' % (domain, video_id) - if '.' in video_id: - video_id = self._parse_json(base64.urlsafe_b64decode( - video_id.split('.')[1]), video_id)['sub'] - manifest_base_url = base_url + 'manifest/video.' formats = self._extract_m3u8_formats( - manifest_base_url + 'm3u8', video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) + 'https://cloudflarestream.com/%s/manifest/video.m3u8' % video_id, + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False) formats.extend(self._extract_mpd_formats( - manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False)) + 'https://cloudflarestream.com/%s/manifest/video.mpd' % video_id, + video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) return { 'id': video_id, 'title': video_id, - 'thumbnail': base_url + 'thumbnails/thumbnail.jpg', 'formats': formats, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a61753b17c..eaae5e484f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,7 +15,7 @@ import time import math from ..compat import ( - compat_cookiejar_Cookie, + compat_cookiejar, compat_cookies, compat_etree_Element, compat_etree_fromstring, @@ -1182,33 +1182,16 @@ class InfoExtractor(object): 'twitter card player') def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld_list = list(re.finditer(JSON_LD_RE, html)) + json_ld = self._search_regex( + JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) + if not json_ld: + return default if default is not NO_DEFAULT else {} # JSON-LD may be malformed and thus `fatal` should be respected. # At the same time `default` may be passed that assumes `fatal=False` # for _search_regex. Let's simulate the same behavior here as well. fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False - json_ld = [] - for mobj in json_ld_list: - json_ld_item = self._parse_json( - mobj.group('json_ld'), video_id, fatal=fatal) - if not json_ld_item: - continue - if isinstance(json_ld_item, dict): - json_ld.append(json_ld_item) - elif isinstance(json_ld_item, (list, tuple)): - json_ld.extend(json_ld_item) - if json_ld: - json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) - if json_ld: - return json_ld - if default is not NO_DEFAULT: - return default - elif fatal: - raise RegexNotFoundError('Unable to extract JSON-LD') - else: - self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message()) - return {} + return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): @@ -1273,10 +1256,10 @@ class InfoExtractor(object): extract_interaction_statistic(e) for e in json_ld: - if '@context' in e: + if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): item_type = e.get('@type') if expected_type is not None and expected_type != item_type: - continue + return info if item_type in ('TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ @@ -1310,17 +1293,11 @@ class InfoExtractor(object): }) elif item_type == 'VideoObject': extract_video_object(e) - if expected_type is None: - continue - else: - break + continue video = e.get('video') if isinstance(video, dict) and video.get('@type') == 'VideoObject': extract_video_object(video) - if expected_type is None: - continue - else: - break + break return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -2363,8 +2340,6 @@ class InfoExtractor(object): if res is False: return [] ism_doc, urlh = res - if ism_doc is None: - return [] return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) @@ -2843,7 +2818,7 @@ class InfoExtractor(object): def _set_cookie(self, domain, name, value, expire_time=None, port=None, path='/', secure=False, discard=False, rest={}, **kwargs): - cookie = compat_cookiejar_Cookie( + cookie = compat_cookiejar.Cookie( 0, name, value, port, port is not None, domain, True, domain.startswith('.'), path, True, secure, expire_time, discard, None, None, rest) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index bc2d1fa8b0..85a9a577f6 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -13,7 +13,6 @@ from ..compat import ( compat_b64decode, compat_etree_Element, compat_etree_fromstring, - compat_str, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, @@ -26,9 +25,9 @@ from ..utils import ( intlist_to_bytes, int_or_none, lowercase_escape, - merge_dicts, remove_end, sanitized_Request, + unified_strdate, urlencode_postdata, xpath_text, ) @@ -137,7 +136,6 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): # rtmp 'skip_download': True, }, - 'skip': 'Video gone', }, { 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', 'info_dict': { @@ -159,12 +157,11 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): 'info_dict': { 'id': '702409', 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, + 'title': 'Re:ZERO -Starting Life in Another World- Episode 5 – The Morning of Our Promise Is Still Distant', + 'description': 'md5:97664de1ab24bbf77a9c01918cb7dca9', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Re:Zero Partners', - 'timestamp': 1462098900, - 'upload_date': '20160501', + 'uploader': 'TV TOKYO', + 'upload_date': '20160508', }, 'params': { # m3u8 download @@ -175,13 +172,12 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): 'info_dict': { 'id': '727589', 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, + 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!", + 'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Kadokawa Pictures Inc.', - 'timestamp': 1484130900, - 'upload_date': '20170111', - 'series': compat_str, + 'upload_date': '20170118', + 'series': "KONOSUBA -God's blessing on this wonderful world!", 'season': "KONOSUBA -God's blessing on this wonderful world! 2", 'season_number': 2, 'episode': 'Give Me Deliverance From This Judicial Injustice!', @@ -204,11 +200,10 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): 'info_dict': { 'id': '535080', 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, + 'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka', + 'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".', 'uploader': 'Marvelous AQL Inc.', - 'timestamp': 1255512600, - 'upload_date': '20091014', + 'upload_date': '20091021', }, 'params': { # Just test metadata extraction @@ -229,17 +224,15 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): # just test metadata extraction 'skip_download': True, }, - 'skip': 'Video gone', }, { # A video with a vastly different season name compared to the series name 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', 'info_dict': { 'id': '590532', 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, + 'title': 'Haiyoru! Nyaruani (ONA) Episode 1 – Test', + 'description': 'Mahiro and Nyaruko talk about official certification.', 'uploader': 'TV TOKYO', - 'timestamp': 1330956000, 'upload_date': '20120305', 'series': 'Nyarko-san: Another Crawling Chaos', 'season': 'Haiyoru! Nyaruani (ONA)', @@ -449,21 +442,23 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage, 'language', default=None, group='lang') video_title = self._html_search_regex( - (r'(?s)]*>((?:(?!]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!', - r'(.+?),\s+-\s+.+? Crunchyroll'), - webpage, 'video_title', default=None) - if not video_title: - video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage)) + r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', + webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = (self._parse_json(self._html_search_regex( r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, webpage, 'description', default='{}'), video_id) or media_metadata).get('description') if video_description: video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) + video_upload_date = self._html_search_regex( + [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], + webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) + if video_upload_date: + video_upload_date = unified_strdate(video_upload_date) video_uploader = self._html_search_regex( # try looking for both an uploader that's a link and one that's not [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], - webpage, 'video_uploader', default=False) + webpage, 'video_uploader', fatal=False) formats = [] for stream in media.get('streams', []): @@ -616,15 +611,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', webpage, 'season number', default=None)) - info = self._search_json_ld(webpage, video_id, default={}) - - return merge_dicts({ + return { 'id': video_id, 'title': video_title, 'description': video_description, 'duration': duration, 'thumbnail': thumbnail, 'uploader': video_uploader, + 'upload_date': video_upload_date, 'series': series, 'season': season, 'season_number': season_number, @@ -632,7 +626,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'episode_number': episode_number, 'subtitles': subtitles, 'formats': formats, - }, info) + } class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index b8529050c4..327fdb04a7 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -32,7 +32,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): @staticmethod def _get_cookie_value(cookies, name): - cookie = cookies.get(name) + cookie = cookies.get('name') if cookie: return cookie.value diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index e700f8d865..04ff214f72 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -16,11 +16,10 @@ class DctpTvIE(InfoExtractor): _TESTS = [{ # 4x3 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', - 'md5': '3ffbd1556c3fe210724d7088fad723e3', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', 'display_id': 'videoinstallation-fuer-eine-kaufhausfassade', - 'ext': 'm4v', + 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', 'thumbnail': r're:^https?://.*\.jpg$', @@ -28,6 +27,10 @@ class DctpTvIE(InfoExtractor): 'timestamp': 1302172322, 'upload_date': '20110407', }, + 'params': { + # rtmp download + 'skip_download': True, + }, }, { # 16x9 'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/', @@ -56,26 +59,33 @@ class DctpTvIE(InfoExtractor): uuid = media['uuid'] title = media['title'] - is_wide = media.get('is_wide') - formats = [] + ratio = '16x9' if media.get('is_wide') else '4x3' + play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio) - def add_formats(suffix): - templ = 'https://%%s/%s_dctp_%s.m4v' % (uuid, suffix) - formats.extend([{ - 'format_id': 'hls-' + suffix, - 'url': templ % 'cdn-segments.dctp.tv' + '/playlist.m3u8', - 'protocol': 'm3u8_native', - }, { - 'format_id': 's3-' + suffix, - 'url': templ % 'completed-media.s3.amazonaws.com', - }, { - 'format_id': 'http-' + suffix, - 'url': templ % 'cdn-media.dctp.tv', - }]) + servers = self._download_json( + 'http://www.dctp.tv/streaming_servers/', display_id, + note='Downloading server list JSON', fatal=False) - add_formats('0500_' + ('16x9' if is_wide else '4x3')) - if is_wide: - add_formats('720p') + if servers: + endpoint = next( + server['endpoint'] + for server in servers + if url_or_none(server.get('endpoint')) + and 'cloudfront' in server['endpoint']) + else: + endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' + + app = self._search_regex( + r'^rtmpe?://[^/]+/(?P<app>.*)$', endpoint, 'app') + + formats = [{ + 'url': endpoint, + 'app': app, + 'play_path': play_path, + 'page_url': url, + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf', + 'ext': 'flv', + }] thumbnails = [] images = media.get('images') diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index e0139cc862..6a2712cc50 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -13,8 +13,8 @@ from ..compat import compat_HTTPError class DiscoveryIE(DiscoveryGoBaseIE): _VALID_URL = r'''(?x)https?:// (?P<site> - go\.discovery| - www\. + (?:(?:www|go)\.)?discovery| + (?:www\.)? (?: investigationdiscovery| discoverylife| @@ -22,7 +22,8 @@ class DiscoveryIE(DiscoveryGoBaseIE): ahctv| destinationamerica| sciencechannel| - tlc + tlc| + velocity )| watch\. (?: @@ -82,7 +83,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): 'authRel': 'authorization', 'client_id': '3020a40c2356a645b4b4', 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - 'redirectUri': 'https://www.discovery.com/', + 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, })['access_token'] headers = self.geo_verification_headers() diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index fe42821c73..c050bf9df3 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( encode_base_n, ExtractorError, @@ -54,7 +55,7 @@ class EpornerIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, display_id) - video_id = self._match_id(urlh.geturl()) + video_id = self._match_id(compat_str(urlh.geturl())) hash = self._search_regex( r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3d3dae7a4d..50f69f0b6c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -105,7 +105,6 @@ from .bilibili import ( BiliBiliBangumiIE, BilibiliAudioIE, BilibiliAudioAlbumIE, - BiliBiliPlayerIE, ) from .biobiochiletv import BioBioChileTVIE from .bitchute import ( @@ -498,6 +497,7 @@ from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .joj import JojIE from .jwplatform import JWPlatformIE +from .jpopsukitv import JpopsukiIE from .kakao import KakaoIE from .kaltura import KalturaIE from .kanalplay import KanalPlayIE @@ -636,10 +636,7 @@ from .mixcloud import ( from .mlb import MLBIE from .mnet import MnetIE from .moevideo import MoeVideoIE -from .mofosex import ( - MofosexIE, - MofosexEmbedIE, -) +from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .morningstar import MorningstarIE from .motherless import ( @@ -804,16 +801,6 @@ from .orf import ( ORFFM4IE, ORFFM4StoryIE, ORFOE1IE, - ORFOE3IE, - ORFNOEIE, - ORFWIEIE, - ORFBGLIE, - ORFOOEIE, - ORFSTMIE, - ORFKTNIE, - ORFSBGIE, - ORFTIRIE, - ORFVBGIE, ORFIPTVIE, ) from .outsidetv import OutsideTVIE @@ -821,6 +808,7 @@ from .packtpub import ( PacktPubIE, PacktPubCourseIE, ) +from .pandatv import PandaTVIE from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE @@ -863,7 +851,6 @@ from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, ) -from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE from .porn91 import Porn91IE from .porncom import PornComIE @@ -976,10 +963,7 @@ from .savefrom import SaveFromIE from .sbs import SBSIE from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE -from .scrippsnetworks import ( - ScrippsNetworksWatchIE, - ScrippsNetworksIE, -) +from .scrippsnetworks import ScrippsNetworksWatchIE from .scte import ( SCTEIE, SCTECourseIE, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 610d667459..ce64e26831 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -466,18 +466,15 @@ class FacebookIE(InfoExtractor): return info_dict if '/posts/' in url: - video_id_json = self._search_regex( - r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids', - default='') - if video_id_json: - entries = [ - self.url_result('facebook:%s' % vid, FacebookIE.ie_key()) - for vid in self._parse_json(video_id_json, video_id)] - return self.playlist_result(entries, video_id) + entries = [ + self.url_result('facebook:%s' % vid, FacebookIE.ie_key()) + for vid in self._parse_json( + self._search_regex( + r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', + webpage, 'video ids', group='ids'), + video_id)] - # Single Video? - video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id') - return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key()) + return self.playlist_result(entries, video_id) else: _, info_dict = self._extract_from_url( self._VIDEO_PAGE_TEMPLATE % video_id, diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index 306b45fc99..b8fa175880 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -31,13 +31,7 @@ class FranceCultureIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_data = extract_attributes(self._search_regex( - r'''(?sx) - (?: - </h1>| - <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> - ).*? - (<button[^>]+data-asset-source="[^"]+"[^>]+>) - ''', + r'(?s)<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(<button[^>]+data-asset-source="[^"]+"[^>]+>)', webpage, 'video data')) video_url = video_data['data-asset-source'] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 355067a509..743ef47dbe 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -60,9 +60,6 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE from .tube8 import Tube8IE -from .mofosex import MofosexEmbedIE -from .spankwire import SpankwireIE -from .youporn import YouPornIE from .vimeo import VimeoIE from .dailymotion import DailymotionIE from .dailymail import DailyMailIE @@ -1708,15 +1705,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, - { - # multiple kaltura embeds, nsfw - 'url': 'https://www.quartier-rouge.be/prive/femmes/kamila-avec-video-jaime-sadomie.html', - 'info_dict': { - 'id': 'kamila-avec-video-jaime-sadomie', - 'title': "Kamila avec vídeo “J'aime sadomie”", - }, - 'playlist_count': 8, - }, { # Non-standard Vimeo embed 'url': 'https://openclassrooms.com/courses/understanding-the-web', @@ -2110,9 +2098,6 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Smoky Barbecue Favorites', 'thumbnail': r're:^https?://.*\.jpe?g', - 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', - 'upload_date': '20170909', - 'timestamp': 1504915200, }, 'add_ie': [ZypeIE.ie_key()], 'params': { @@ -2299,7 +2284,7 @@ class GenericIE(InfoExtractor): if head_response is not False: # Check for redirect - new_url = head_response.geturl() + new_url = compat_str(head_response.geturl()) if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -2399,12 +2384,12 @@ class GenericIE(InfoExtractor): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=full_response.geturl()), + xspf_base_url=compat_str(full_response.geturl())), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, - mpd_base_url=full_response.geturl().rpartition('/')[0], + mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict @@ -2548,21 +2533,15 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - # Look for Teachable embeds, must be before Wistia - teachable_url = TeachableIE._extract_url(webpage, url) - if teachable_url: - return self.url_result(teachable_url) - # Look for embedded Wistia player - wistia_urls = WistiaIE._extract_urls(webpage) - if wistia_urls: - playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) - for entry in playlist['entries']: - entry.update({ - '_type': 'url_transparent', - 'uploader': video_uploader, - }) - return playlist + wistia_url = WistiaIE._extract_url(webpage) + if wistia_url: + return { + '_type': 'url_transparent', + 'url': self._proto_relative_url(wistia_url), + 'ie_key': WistiaIE.ie_key(), + 'uploader': video_uploader, + } # Look for SVT player svt_url = SVTIE._extract_url(webpage) @@ -2727,21 +2706,6 @@ class GenericIE(InfoExtractor): if tube8_urls: return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) - # Look for embedded Mofosex player - mofosex_urls = MofosexEmbedIE._extract_urls(webpage) - if mofosex_urls: - return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) - - # Look for embedded Spankwire player - spankwire_urls = SpankwireIE._extract_urls(webpage) - if spankwire_urls: - return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) - - # Look for embedded YouPorn player - youporn_urls = YouPornIE._extract_urls(webpage) - if youporn_urls: - return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) - # Look for embedded Tvigle player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) @@ -2853,12 +2817,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - kaltura_urls = KalturaIE._extract_urls(webpage) - if kaltura_urls: - return self.playlist_from_matches( - kaltura_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'source_url': url}), - ie=KalturaIE.ie_key()) + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) # Look for EaglePlatform embeds eagleplatform_url = EaglePlatformIE._extract_url(webpage) @@ -2999,7 +2960,7 @@ class GenericIE(InfoExtractor): # Look for VODPlatform embeds mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1', + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1', webpage) if mobj is not None: return self.url_result( @@ -3176,6 +3137,10 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + teachable_url = TeachableIE._extract_url(webpage, url) + if teachable_url: + return self.url_result(teachable_url) + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) if indavideo_urls: return self.playlist_from_matches( diff --git a/youtube_dl/extractor/giantbomb.py b/youtube_dl/extractor/giantbomb.py index c6477958d2..6a1b1e96eb 100644 --- a/youtube_dl/extractor/giantbomb.py +++ b/youtube_dl/extractor/giantbomb.py @@ -13,10 +13,10 @@ from ..utils import ( class GiantBombIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/(?:videos|shows)/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)' - _TESTS = [{ + _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)' + _TEST = { 'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/', - 'md5': '132f5a803e7e0ab0e274d84bda1e77ae', + 'md5': 'c8ea694254a59246a42831155dec57ac', 'info_dict': { 'id': '2300-9782', 'display_id': 'quick-look-destiny-the-dark-below', @@ -26,10 +26,7 @@ class GiantBombIE(InfoExtractor): 'duration': 2399, 'thumbnail': r're:^https?://.*\.jpg$', } - }, { - 'url': 'https://www.giantbomb.com/shows/ben-stranding/2970-20212', - 'only_matching': True, - }] + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py index fae4251034..0ee8ea712c 100644 --- a/youtube_dl/extractor/hellporno.py +++ b/youtube_dl/extractor/hellporno.py @@ -1,11 +1,12 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - int_or_none, - merge_dicts, + js_to_json, remove_end, - unified_timestamp, + determine_ext, ) @@ -13,21 +14,15 @@ class HellPornoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?hellporno\.(?:com/videos|net/v)/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/', - 'md5': 'f0a46ebc0bed0c72ae8fe4629f7de5f3', + 'md5': '1fee339c610d2049699ef2aa699439f1', 'info_dict': { 'id': '149116', 'display_id': 'dixie-is-posing-with-naked-ass-very-erotic', 'ext': 'mp4', 'title': 'Dixie is posing with naked ass very erotic', - 'description': 'md5:9a72922749354edb1c4b6e540ad3d215', - 'categories': list, 'thumbnail': r're:https?://.*\.jpg$', - 'duration': 240, - 'timestamp': 1398762720, - 'upload_date': '20140429', - 'view_count': int, 'age_limit': 18, - }, + } }, { 'url': 'http://hellporno.net/v/186271/', 'only_matching': True, @@ -41,36 +36,40 @@ class HellPornoIE(InfoExtractor): title = remove_end(self._html_search_regex( r'<title>([^<]+)', webpage, 'title'), ' - Hell Porno') - info = self._parse_html5_media_entries(url, webpage, display_id)[0] - self._sort_formats(info['formats']) + flashvars = self._parse_json(self._search_regex( + r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'), + display_id, transform_source=js_to_json) - video_id = self._search_regex( - (r'chs_object\s*=\s*["\'](\d+)', - r'params\[["\']video_id["\']\]\s*=\s*(\d+)'), webpage, 'video id', - default=display_id) - description = self._search_regex( - r'class=["\']desc_video_view_v2[^>]+>([^<]+)', webpage, - 'description', fatal=False) - categories = [ - c.strip() - for c in self._html_search_meta( - 'keywords', webpage, 'categories', default='').split(',') - if c.strip()] - duration = int_or_none(self._og_search_property( - 'video:duration', webpage, fatal=False)) - timestamp = unified_timestamp(self._og_search_property( - 'video:release_date', webpage, fatal=False)) - view_count = int_or_none(self._search_regex( - r'>Views\s+(\d+)', webpage, 'view count', fatal=False)) + video_id = flashvars.get('video_id') + thumbnail = flashvars.get('preview_url') + ext = determine_ext(flashvars.get('postfix'), 'mp4') - return merge_dicts(info, { + formats = [] + for video_url_key in ['video_url', 'video_alt_url']: + video_url = flashvars.get(video_url_key) + if not video_url: + continue + video_text = flashvars.get('%s_text' % video_url_key) + fmt = { + 'url': video_url, + 'ext': ext, + 'format_id': video_text, + } + m = re.search(r'^(?P\d+)[pP]', video_text) + if m: + fmt['height'] = int(m.group('height')) + formats.append(fmt) + self._sort_formats(formats) + + categories = self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + + return { 'id': video_id, 'display_id': display_id, 'title': title, - 'description': description, + 'thumbnail': thumbnail, 'categories': categories, - 'duration': duration, - 'timestamp': timestamp, - 'view_count': view_count, 'age_limit': 18, - }) + 'formats': formats, + } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index a31301985b..436759da54 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import base64 -import json import re from .common import InfoExtractor @@ -10,7 +8,6 @@ from ..utils import ( mimetype2ext, parse_duration, qualities, - try_get, url_or_none, ) @@ -18,16 +15,15 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).*?[/-]vi(?P\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).+?[/-]vi(?P\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'No. 2', + 'title': 'No. 2 from Ice Age: Continental Drift (2012)', 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', - 'duration': 152, } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -51,23 +47,21 @@ class ImdbIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - data = self._download_json( - 'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id, - query={ - 'key': base64.b64encode(json.dumps({ - 'type': 'VIDEO_PLAYER', - 'subType': 'FORCE_LEGACY', - 'id': 'vi%s' % video_id, - }).encode()).decode(), - })[0] + webpage = self._download_webpage( + 'https://www.imdb.com/videoplayer/vi' + video_id, video_id) + video_metadata = self._parse_json(self._search_regex( + r'window\.IMDbReactInitialState\.push\(({.+?})\);', webpage, + 'video metadata'), video_id)['videos']['videoMetadata']['vi' + video_id] + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage) or self._html_search_regex( + r'(.+?)', webpage, 'title', fatal=False) or video_metadata['title'] quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] - for encoding in data['videoLegacyEncodings']: + for encoding in video_metadata.get('encodings', []): if not encoding or not isinstance(encoding, dict): continue - video_url = url_or_none(encoding.get('url')) + video_url = url_or_none(encoding.get('videoUrl')) if not video_url: continue ext = mimetype2ext(encoding.get( @@ -75,7 +69,7 @@ class ImdbIE(InfoExtractor): if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=1, m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False)) continue format_id = encoding.get('definition') formats.append({ @@ -86,33 +80,13 @@ class ImdbIE(InfoExtractor): }) self._sort_formats(formats) - webpage = self._download_webpage( - 'https://www.imdb.com/video/vi' + video_id, video_id) - video_metadata = self._parse_json(self._search_regex( - r'args\.push\(\s*({.+?})\s*\)\s*;', webpage, - 'video metadata'), video_id) - - video_info = video_metadata.get('VIDEO_INFO') - if video_info and isinstance(video_info, dict): - info = try_get( - video_info, lambda x: x[list(video_info.keys())[0]][0], dict) - else: - info = {} - - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage) or self._html_search_regex( - r'(.+?)', webpage, 'title', - default=None) or info['videoTitle'] - return { 'id': video_id, 'title': title, - 'alt_title': info.get('videoSubTitle'), 'formats': formats, - 'description': info.get('videoDescription'), - 'thumbnail': url_or_none(try_get( - video_metadata, lambda x: x['videoSlate']['source'])), - 'duration': parse_duration(info.get('videoRuntime')), + 'description': video_metadata.get('description'), + 'thumbnail': video_metadata.get('slate', {}).get('url'), + 'duration': parse_duration(video_metadata.get('duration')), } diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 4c16243ec1..2b5b2b5b0b 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -58,7 +58,7 @@ class IndavideoEmbedIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id, + 'http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id, video_id)['data'] title = video['title'] diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 53a550c11e..11bbeb5922 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -16,22 +16,12 @@ class IPrimaIE(InfoExtractor): _GEO_BYPASS = False _TESTS = [{ - 'url': 'https://prima.iprima.cz/particka/92-epizoda', + 'url': 'http://play.iprima.cz/gondici-s-r-o-33', 'info_dict': { - 'id': 'p51388', + 'id': 'p136534', 'ext': 'mp4', - 'title': 'Partička (92)', - 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, { - 'url': 'https://cnn.iprima.cz/videa/70-epizoda', - 'info_dict': { - 'id': 'p681554', - 'ext': 'mp4', - 'title': 'HLAVNÍ ZPRÁVY 3.5.2020', + 'title': 'Gondíci s. r. o. (34)', + 'description': 'md5:16577c629d006aa91f59ca8d8e7f99bd', }, 'params': { 'skip_download': True, # m3u8 download @@ -78,15 +68,9 @@ class IPrimaIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._og_search_title( - webpage, default=None) or self._search_regex( - r'

([^<]+)', webpage, 'title') - video_id = self._search_regex( (r']+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)', - r'data-product="([^"]+)">', - r'id=["\']player-(p\d+)"', - r'playerId\s*:\s*["\']player-(p\d+)'), + r'data-product="([^"]+)">'), webpage, 'real id') playerpage = self._download_webpage( @@ -141,8 +125,8 @@ class IPrimaIE(InfoExtractor): return { 'id': video_id, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, - 'description': self._og_search_description(webpage, default=None), + 'description': self._og_search_description(webpage), } diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index b5a740a01e..a502e88066 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -239,7 +239,7 @@ class IviCompilationIE(InfoExtractor): self.url_result( 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key()) for serie in re.findall( - r']+\bhref=["\']/watch/%s/(\d+)["\']' % compilation_id, html)] + r']+data-id="\1"' % compilation_id, html)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py new file mode 100644 index 0000000000..4b5f346d1e --- /dev/null +++ b/youtube_dl/extractor/jpopsukitv.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, +) + + +class JpopsukiIE(InfoExtractor): + IE_NAME = 'jpopsuki.tv' + _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/(?:category/)?video/[^/]+/(?P\S+)' + + _TEST = { + 'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771', + 'md5': '88018c0c1a9b1387940e90ec9e7e198e', + 'info_dict': { + 'id': '00be659d23b0b40508169cdee4545771', + 'ext': 'mp4', + 'title': 'ayumi hamasaki - evolution', + 'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution', + 'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg', + 'uploader': 'plama_chan', + 'uploader_id': '404', + 'upload_date': '20121101' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = 'http://www.jpopsuki.tv' + self._html_search_regex( + r'from: uploaded: (.*?)', webpage, 'video upload_date', + fatal=False)) + view_count_str = self._html_search_regex( + r'
  • Hits: ([0-9]+?)
  • ', webpage, 'video view_count', + fatal=False) + comment_count_str = self._html_search_regex( + r'

    ([0-9]+?) comments

    ', webpage, 'video comment_count', + fatal=False) + + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'view_count': int_or_none(view_count_str), + 'comment_count': int_or_none(comment_count_str), + } diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index c34b5f5e6b..2aabd98b5b 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import unsmuggle_url class JWPlatformIE(InfoExtractor): @@ -33,14 +32,10 @@ class JWPlatformIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return re.findall( - r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})', + r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//content\.jwplatform\.com/players/[a-zA-Z0-9]{8})', webpage) def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - self._initialize_geo_bypass({ - 'countries': smuggled_data.get('geo_countries'), - }) video_id = self._match_id(url) json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id) return self._parse_jwplayer_data(json_data, video_id) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 49d13460df..2d38b758b7 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -113,14 +113,9 @@ class KalturaIE(InfoExtractor): @staticmethod def _extract_url(webpage): - urls = KalturaIE._extract_urls(webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(webpage): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site - finditer = ( - re.finditer( + mobj = ( + re.search( r"""(?xs) kWidget\.(?:thumb)?[Ee]mbed\( \{.*? @@ -129,7 +124,7 @@ class KalturaIE(InfoExtractor): (?P['"])entry_?[Ii]d(?P=q3)\s*:\s* (?P['"])(?P(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) """, webpage) - or re.finditer( + or re.search( r'''(?xs) (?P["']) (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* @@ -143,7 +138,7 @@ class KalturaIE(InfoExtractor): ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) ''', webpage) - or re.finditer( + or re.search( r'''(?xs) <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) @@ -153,8 +148,7 @@ class KalturaIE(InfoExtractor): (?P=q1) ''', webpage) ) - urls = [] - for mobj in finditer: + if mobj: embed_info = mobj.groupdict() for k, v in embed_info.items(): if v: @@ -166,8 +160,7 @@ class KalturaIE(InfoExtractor): webpage) if service_mobj: url = smuggle_url(url, {'service_url': service_mobj.group('id')}) - urls.append(url) - return urls + return url def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): params = actions[0] diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py index 1b2dcef466..6ed7da4aba 100644 --- a/youtube_dl/extractor/lecturio.py +++ b/youtube_dl/extractor/lecturio.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, determine_ext, @@ -35,7 +36,7 @@ class LecturioBaseIE(InfoExtractor): self._LOGIN_URL, None, 'Downloading login popup') def is_logged(url_handle): - return self._LOGIN_URL not in url_handle.geturl() + return self._LOGIN_URL not in compat_str(url_handle.geturl()) # Already logged in if is_logged(urlh): diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py index 1e3c19dfd6..b312e77f1a 100644 --- a/youtube_dl/extractor/lego.py +++ b/youtube_dl/extractor/lego.py @@ -2,24 +2,23 @@ from __future__ import unicode_literals import re -import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import compat_str from ..utils import ( - ExtractorError, - int_or_none, - qualities, + unescapeHTML, + parse_duration, + get_element_by_class, ) class LEGOIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lego\.com/(?P[a-z]{2}-[a-z]{2})/(?:[^/]+/)*videos/(?:[^/]+/)*[^/?#]+-(?P[0-9a-f]{32})' + _VALID_URL = r'https?://(?:www\.)?lego\.com/(?P[^/]+)/(?:[^/]+/)*videos/(?:[^/]+/)*[^/?#]+-(?P[0-9a-f]+)' _TESTS = [{ 'url': 'http://www.lego.com/en-us/videos/themes/club/blocumentary-kawaguchi-55492d823b1b4d5e985787fa8c2973b1', 'md5': 'f34468f176cfd76488767fc162c405fa', 'info_dict': { - 'id': '55492d82-3b1b-4d5e-9857-87fa8c2973b1_en-US', + 'id': '55492d823b1b4d5e985787fa8c2973b1', 'ext': 'mp4', 'title': 'Blocumentary Great Creations: Akiyuki Kawaguchi', 'description': 'Blocumentary Great Creations: Akiyuki Kawaguchi', @@ -27,123 +26,103 @@ class LEGOIE(InfoExtractor): }, { # geo-restricted but the contentUrl contain a valid url 'url': 'http://www.lego.com/nl-nl/videos/themes/nexoknights/episode-20-kingdom-of-heroes-13bdc2299ab24d9685701a915b3d71e7##sp=399', - 'md5': 'c7420221f7ffd03ff056f9db7f8d807c', + 'md5': '4c3fec48a12e40c6e5995abc3d36cc2e', 'info_dict': { - 'id': '13bdc229-9ab2-4d96-8570-1a915b3d71e7_nl-NL', + 'id': '13bdc2299ab24d9685701a915b3d71e7', 'ext': 'mp4', - 'title': 'Aflevering 20: Helden van het koninkrijk', + 'title': 'Aflevering 20 - Helden van het koninkrijk', 'description': 'md5:8ee499aac26d7fa8bcb0cedb7f9c3941', - 'age_limit': 5, }, }, { - # with subtitle - 'url': 'https://www.lego.com/nl-nl/kids/videos/classic/creative-storytelling-the-little-puppy-aa24f27c7d5242bc86102ebdc0f24cba', + # special characters in title + 'url': 'http://www.lego.com/en-us/starwars/videos/lego-star-wars-force-surprise-9685ee9d12e84ff38e84b4e3d0db533d', 'info_dict': { - 'id': 'aa24f27c-7d52-42bc-8610-2ebdc0f24cba_nl-NL', + 'id': '9685ee9d12e84ff38e84b4e3d0db533d', 'ext': 'mp4', - 'title': 'De kleine puppy', - 'description': 'md5:5b725471f849348ac73f2e12cfb4be06', - 'age_limit': 1, - 'subtitles': { - 'nl': [{ - 'ext': 'srt', - 'url': r're:^https://.+\.srt$', - }], - }, + 'title': 'Force Surprise – LEGO® Star Wars™ Microfighters', + 'description': 'md5:9c673c96ce6f6271b88563fe9dc56de3', }, 'params': { 'skip_download': True, }, }] - _QUALITIES = { - 'Lowest': (64, 180, 320), - 'Low': (64, 270, 480), - 'Medium': (96, 360, 640), - 'High': (128, 540, 960), - 'Highest': (128, 720, 1280), - } + _BITRATES = [256, 512, 1024, 1536, 2560] def _real_extract(self, url): locale, video_id = re.match(self._VALID_URL, url).groups() - countries = [locale.split('-')[1].upper()] - self._initialize_geo_bypass({ - 'countries': countries, - }) + webpage = self._download_webpage(url, video_id) + title = get_element_by_class('video-header', webpage).strip() + progressive_base = 'https://lc-mediaplayerns-live-s.legocdn.com/' + streaming_base = 'http://legoprod-f.akamaihd.net/' + content_url = self._html_search_meta('contentUrl', webpage) + path = self._search_regex( + r'(?:https?:)?//[^/]+/(?:[iz]/s/)?public/(.+)_[0-9,]+\.(?:mp4|webm)', + content_url, 'video path', default=None) + if not path: + player_url = self._proto_relative_url(self._search_regex( + r']+src="((?:https?)?//(?:www\.)?lego\.com/[^/]+/mediaplayer/video/[^"]+)', + webpage, 'player url', default=None)) + if not player_url: + base_url = self._proto_relative_url(self._search_regex( + r'data-baseurl="([^"]+)"', webpage, 'base url', + default='http://www.lego.com/%s/mediaplayer/video/' % locale)) + player_url = base_url + video_id + player_webpage = self._download_webpage(player_url, video_id) + video_data = self._parse_json(unescapeHTML(self._search_regex( + r"video='([^']+)'", player_webpage, 'video data')), video_id) + progressive_base = self._search_regex( + r'data-video-progressive-url="([^"]+)"', + player_webpage, 'progressive base', default='https://lc-mediaplayerns-live-s.legocdn.com/') + streaming_base = self._search_regex( + r'data-video-streaming-url="([^"]+)"', + player_webpage, 'streaming base', default='http://legoprod-f.akamaihd.net/') + item_id = video_data['ItemId'] - try: - item = self._download_json( - # https://contentfeed.services.lego.com/api/v2/item/[VIDEO_ID]?culture=[LOCALE]&contentType=Video - 'https://services.slingshot.lego.com/mediaplayer/v2', - video_id, query={ - 'videoId': '%s_%s' % (uuid.UUID(video_id), locale), - }, headers=self.geo_verification_headers()) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 451: - self.raise_geo_restricted(countries=countries) - raise + net_storage_path = video_data.get('NetStoragePath') or '/'.join([item_id[:2], item_id[2:4]]) + base_path = '_'.join([item_id, video_data['VideoId'], video_data['Locale'], compat_str(video_data['VideoVersion'])]) + path = '/'.join([net_storage_path, base_path]) + streaming_path = ','.join(map(lambda bitrate: compat_str(bitrate), self._BITRATES)) - video = item['Video'] - video_id = video['Id'] - title = video['Title'] - - q = qualities(['Lowest', 'Low', 'Medium', 'High', 'Highest']) - formats = [] - for video_source in item.get('VideoFormats', []): - video_source_url = video_source.get('Url') - if not video_source_url: - continue - video_source_format = video_source.get('Format') - if video_source_format == 'F4M': - formats.extend(self._extract_f4m_formats( - video_source_url, video_id, - f4m_id=video_source_format, fatal=False)) - elif video_source_format == 'M3U8': - formats.extend(self._extract_m3u8_formats( - video_source_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=video_source_format, fatal=False)) - else: - video_source_quality = video_source.get('Quality') - format_id = [] - for v in (video_source_format, video_source_quality): - if v: - format_id.append(v) - f = { - 'format_id': '-'.join(format_id), - 'quality': q(video_source_quality), - 'url': video_source_url, - } - quality = self._QUALITIES.get(video_source_quality) - if quality: - f.update({ - 'abr': quality[0], - 'height': quality[1], - 'width': quality[2], - }), - formats.append(f) - self._sort_formats(formats) - - subtitles = {} - sub_file_id = video.get('SubFileId') - if sub_file_id and sub_file_id != '00000000-0000-0000-0000-000000000000': - net_storage_path = video.get('NetstoragePath') - invariant_id = video.get('InvariantId') - video_file_id = video.get('VideoFileId') - video_version = video.get('VideoVersion') - if net_storage_path and invariant_id and video_file_id and video_version: - subtitles.setdefault(locale[:2], []).append({ - 'url': 'https://lc-mediaplayerns-live-s.legocdn.com/public/%s/%s_%s_%s_%s_sub.srt' % (net_storage_path, invariant_id, video_file_id, locale, video_version), + formats = self._extract_akamai_formats( + '%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id) + m3u8_formats = list(filter( + lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none', + formats)) + if len(m3u8_formats) == len(self._BITRATES): + self._sort_formats(m3u8_formats) + for bitrate, m3u8_format in zip(self._BITRATES, m3u8_formats): + progressive_base_url = '%spublic/%s_%d.' % (progressive_base, path, bitrate) + mp4_f = m3u8_format.copy() + mp4_f.update({ + 'url': progressive_base_url + 'mp4', + 'format_id': m3u8_format['format_id'].replace('hls', 'mp4'), + 'protocol': 'http', }) + web_f = { + 'url': progressive_base_url + 'webm', + 'format_id': m3u8_format['format_id'].replace('hls', 'webm'), + 'width': m3u8_format['width'], + 'height': m3u8_format['height'], + 'tbr': m3u8_format.get('tbr'), + 'ext': 'webm', + } + formats.extend([web_f, mp4_f]) + else: + for bitrate in self._BITRATES: + for ext in ('web', 'mp4'): + formats.append({ + 'format_id': '%s-%s' % (ext, bitrate), + 'url': '%spublic/%s_%d.%s' % (progressive_base, path, bitrate, ext), + 'tbr': bitrate, + 'ext': ext, + }) + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'description': video.get('Description'), - 'thumbnail': video.get('GeneratedCoverImage') or video.get('GeneratedThumbnail'), - 'duration': int_or_none(video.get('Length')), + 'description': self._html_search_meta('description', webpage), + 'thumbnail': self._html_search_meta('thumbnail', webpage), + 'duration': parse_duration(self._html_search_meta('duration', webpage)), 'formats': formats, - 'subtitles': subtitles, - 'age_limit': int_or_none(video.get('AgeFrom')), - 'season': video.get('SeasonTitle'), - 'season_number': int_or_none(video.get('Season')) or None, - 'episode_number': int_or_none(video.get('Episode')) or None, } diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 39f74d2822..729d8de50f 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -18,6 +18,7 @@ from ..utils import ( class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' + _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' @classmethod def _extract_urls(cls, webpage, source_url): @@ -69,8 +70,7 @@ class LimelightBaseIE(InfoExtractor): try: return self._download_json( self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), - item_id, 'Downloading PlaylistService %s JSON' % method, - fatal=fatal, headers=headers) + item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal, headers=headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission'] @@ -79,22 +79,22 @@ class LimelightBaseIE(InfoExtractor): raise ExtractorError(error, expected=True) raise - def _extract(self, item_id, pc_method, mobile_method, referer=None): + def _call_api(self, organization_id, item_id, method): + return self._download_json( + self._API_URL % (organization_id, self._API_PATH, item_id, method), + item_id, 'Downloading API %s JSON' % method) + + def _extract(self, item_id, pc_method, mobile_method, meta_method, referer=None): pc = self._call_playlist_service(item_id, pc_method, referer=referer) - mobile = self._call_playlist_service( - item_id, mobile_method, fatal=False, referer=referer) - return pc, mobile - - def _extract_info(self, pc, mobile, i, referer): - get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {} - pc_item = get_item(pc, 'playlistItems') - mobile_item = get_item(mobile, 'mediaList') - video_id = pc_item.get('mediaId') or mobile_item['mediaId'] - title = pc_item.get('title') or mobile_item['title'] + metadata = self._call_api(pc['orgId'], item_id, meta_method) + mobile = self._call_playlist_service(item_id, mobile_method, fatal=False, referer=referer) + return pc, mobile, metadata + def _extract_info(self, streams, mobile_urls, properties): + video_id = properties['media_id'] formats = [] urls = [] - for stream in pc_item.get('streams', []): + for stream in streams: stream_url = stream.get('url') if not stream_url or stream.get('drmProtected') or stream_url in urls: continue @@ -155,7 +155,7 @@ class LimelightBaseIE(InfoExtractor): }) formats.append(fmt) - for mobile_url in mobile_item.get('mobileUrls', []): + for mobile_url in mobile_urls: media_url = mobile_url.get('mobileUrl') format_id = mobile_url.get('targetMediaPlatform') if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls: @@ -179,34 +179,54 @@ class LimelightBaseIE(InfoExtractor): self._sort_formats(formats) - subtitles = {} - for flag in mobile_item.get('flags'): - if flag == 'ClosedCaptions': - closed_captions = self._call_playlist_service( - video_id, 'getClosedCaptionsDetailsByMediaId', - False, referer) or [] - for cc in closed_captions: - cc_url = cc.get('webvttFileUrl') - if not cc_url: - continue - lang = cc.get('languageCode') or self._search_regex(r'/[a-z]{2}\.vtt', cc_url, 'lang', default='en') - subtitles.setdefault(lang, []).append({ - 'url': cc_url, - }) - break + title = properties['title'] + description = properties.get('description') + timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date')) + duration = float_or_none(properties.get('duration_in_milliseconds'), 1000) + filesize = int_or_none(properties.get('total_storage_in_bytes')) + categories = [properties.get('category')] + tags = properties.get('tags', []) + thumbnails = [{ + 'url': thumbnail['url'], + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')] - get_meta = lambda x: pc_item.get(x) or mobile_item.get(x) + subtitles = {} + for caption in properties.get('captions', []): + lang = caption.get('language_code') + subtitles_url = caption.get('url') + if lang and subtitles_url: + subtitles.setdefault(lang, []).append({ + 'url': subtitles_url, + }) + closed_captions_url = properties.get('closed_captions_url') + if closed_captions_url: + subtitles.setdefault('en', []).append({ + 'url': closed_captions_url, + 'ext': 'ttml', + }) return { 'id': video_id, 'title': title, - 'description': get_meta('description'), + 'description': description, 'formats': formats, - 'duration': float_or_none(get_meta('durationInMilliseconds'), 1000), - 'thumbnail': get_meta('previewImageUrl') or get_meta('thumbnailImageUrl'), + 'timestamp': timestamp, + 'duration': duration, + 'filesize': filesize, + 'categories': categories, + 'tags': tags, + 'thumbnails': thumbnails, 'subtitles': subtitles, } + def _extract_info_helper(self, pc, mobile, i, metadata): + return self._extract_info( + try_get(pc, lambda x: x['playlistItems'][i]['streams'], list) or [], + try_get(mobile, lambda x: x['mediaList'][i]['mobileUrls'], list) or [], + metadata) + class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' @@ -231,6 +251,8 @@ class LimelightMediaIE(LimelightBaseIE): 'description': 'md5:8005b944181778e313d95c1237ddb640', 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 144.23, + 'timestamp': 1244136834, + 'upload_date': '20090604', }, 'params': { # m3u8 download @@ -246,29 +268,30 @@ class LimelightMediaIE(LimelightBaseIE): 'title': '3Play Media Overview Video', 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 78.101, - # TODO: extract all languages that were accessible via API - # 'subtitles': 'mincount:9', - 'subtitles': 'mincount:1', + 'timestamp': 1338929955, + 'upload_date': '20120605', + 'subtitles': 'mincount:9', }, }, { 'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452', 'only_matching': True, }] _PLAYLIST_SERVICE_PATH = 'media' + _API_PATH = 'media' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) - source_url = smuggled_data.get('source_url') self._initialize_geo_bypass({ 'countries': smuggled_data.get('geo_countries'), }) - pc, mobile = self._extract( + pc, mobile, metadata = self._extract( video_id, 'getPlaylistByMediaId', - 'getMobilePlaylistByMediaId', source_url) + 'getMobilePlaylistByMediaId', 'properties', + smuggled_data.get('source_url')) - return self._extract_info(pc, mobile, 0, source_url) + return self._extract_info_helper(pc, mobile, 0, metadata) class LimelightChannelIE(LimelightBaseIE): @@ -290,7 +313,6 @@ class LimelightChannelIE(LimelightBaseIE): 'info_dict': { 'id': 'ab6a524c379342f9b23642917020c082', 'title': 'Javascript Sample Code', - 'description': 'Javascript Sample Code - http://www.delvenetworks.com/sample-code/playerCode-demo.html', }, 'playlist_mincount': 3, }, { @@ -298,23 +320,22 @@ class LimelightChannelIE(LimelightBaseIE): 'only_matching': True, }] _PLAYLIST_SERVICE_PATH = 'channel' + _API_PATH = 'channels' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) channel_id = self._match_id(url) - source_url = smuggled_data.get('source_url') - pc, mobile = self._extract( + pc, mobile, medias = self._extract( channel_id, 'getPlaylistByChannelId', 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', - source_url) + 'media', smuggled_data.get('source_url')) entries = [ - self._extract_info(pc, mobile, i, source_url) - for i in range(len(pc['playlistItems']))] + self._extract_info_helper(pc, mobile, i, medias['media_list'][i]) + for i in range(len(medias['media_list']))] - return self.playlist_result( - entries, channel_id, pc.get('title'), mobile.get('description')) + return self.playlist_result(entries, channel_id, pc['title']) class LimelightChannelListIE(LimelightBaseIE): @@ -347,12 +368,10 @@ class LimelightChannelListIE(LimelightBaseIE): def _real_extract(self, url): channel_list_id = self._match_id(url) - channel_list = self._call_playlist_service( - channel_list_id, 'getMobileChannelListById') + channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById') entries = [ self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel') for channel in channel_list['channelList']] - return self.playlist_result( - entries, channel_list_id, channel_list['title']) + return self.playlist_result(entries, channel_list_id, channel_list['title']) diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py index 23ca965d97..a78c6556e1 100644 --- a/youtube_dl/extractor/linuxacademy.py +++ b/youtube_dl/extractor/linuxacademy.py @@ -8,6 +8,7 @@ from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_HTTPError, + compat_str, ) from ..utils import ( ExtractorError, @@ -98,7 +99,7 @@ class LinuxAcademyIE(InfoExtractor): 'sso': 'true', }) - login_state_url = urlh.geturl() + login_state_url = compat_str(urlh.geturl()) try: login_page = self._download_webpage( @@ -128,7 +129,7 @@ class LinuxAcademyIE(InfoExtractor): }) access_token = self._search_regex( - r'access_token=([^=&]+)', urlh.geturl(), + r'access_token=([^=&]+)', compat_str(urlh.geturl()), 'access token') self._download_webpage( diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 65cc474db0..6b0e64b7f1 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -20,10 +20,10 @@ class MailRuIE(InfoExtractor): IE_DESC = 'Видео@Mail.Ru' _VALID_URL = r'''(?x) https?:// - (?:(?:www|m)\.)?my\.mail\.ru/+ + (?:(?:www|m)\.)?my\.mail\.ru/ (?: video/.*\#video=/?(?P(?:[^/]+/){3}\d+)| - (?:(?P(?:[^/]+/+){2})video/(?P[^/]+/\d+))\.html| + (?:(?P(?:[^/]+/){2})video/(?P[^/]+/\d+))\.html| (?:video/embed|\+/video/meta)/(?P\d+) ) ''' @@ -85,14 +85,6 @@ class MailRuIE(InfoExtractor): { 'url': 'http://my.mail.ru/+/video/meta/7949340477499637815', 'only_matching': True, - }, - { - 'url': 'https://my.mail.ru//list/sinyutin10/video/_myvideo/4.html', - 'only_matching': True, - }, - { - 'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html', - 'only_matching': True, } ] @@ -128,12 +120,6 @@ class MailRuIE(InfoExtractor): 'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON') - headers = {} - - video_key = self._get_cookies('https://my.mail.ru').get('video_key') - if video_key: - headers['Cookie'] = 'video_key=%s' % video_key.value - formats = [] for f in video_data['videos']: video_url = f.get('url') @@ -146,7 +132,6 @@ class MailRuIE(InfoExtractor): 'url': video_url, 'format_id': format_id, 'height': height, - 'http_headers': headers, }) self._sort_formats(formats) @@ -252,7 +237,7 @@ class MailRuMusicSearchBaseIE(InfoExtractor): class MailRuMusicIE(MailRuMusicSearchBaseIE): IE_NAME = 'mailru:music' IE_DESC = 'Музыка@Mail.Ru' - _VALID_URL = r'https?://my\.mail\.ru/+music/+songs/+[^/?#&]+-(?P[\da-f]+)' + _VALID_URL = r'https?://my\.mail\.ru/music/songs/[^/?#&]+-(?P[\da-f]+)' _TESTS = [{ 'url': 'https://my.mail.ru/music/songs/%D0%BC8%D0%BB8%D1%82%D1%85-l-a-h-luciferian-aesthetics-of-herrschaft-single-2017-4e31f7125d0dfaef505d947642366893', 'md5': '0f8c22ef8c5d665b13ac709e63025610', @@ -288,7 +273,7 @@ class MailRuMusicIE(MailRuMusicSearchBaseIE): class MailRuMusicSearchIE(MailRuMusicSearchBaseIE): IE_NAME = 'mailru:music:search' IE_DESC = 'Музыка@Mail.Ru' - _VALID_URL = r'https?://my\.mail\.ru/+music/+search/+(?P[^/?#&]+)' + _VALID_URL = r'https?://my\.mail\.ru/music/search/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://my.mail.ru/music/search/black%20shadow', 'info_dict': { diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py index 6f4fd927fa..e13c2e11a5 100644 --- a/youtube_dl/extractor/malltv.py +++ b/youtube_dl/extractor/malltv.py @@ -8,7 +8,7 @@ from ..utils import merge_dicts class MallTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|sk)\.)?mall\.tv/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?mall\.tv/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', 'md5': '1c4a37f080e1f3023103a7b43458e518', @@ -26,9 +26,6 @@ class MallTVIE(InfoExtractor): }, { 'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', 'only_matching': True, - }, { - 'url': 'https://sk.mall.tv/gejmhaus/reklamacia-nehreje-vyrobnik-tepla-alebo-spekacka', - 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 933df14952..f976506f41 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -6,6 +6,7 @@ import re from .theplatform import ThePlatformBaseIE from ..compat import ( compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -113,7 +114,7 @@ class MediasetIE(ThePlatformBaseIE): continue urlh = ie._request_webpage( embed_url, video_id, note='Following embed URL redirect') - embed_url = urlh.geturl() + embed_url = compat_str(urlh.geturl()) program_guid = _program_guid(_qs(embed_url)) if program_guid: entries.append(embed_url) @@ -122,7 +123,7 @@ class MediasetIE(ThePlatformBaseIE): def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): for video in smil.findall(self._xpath_ns('.//video', namespace)): video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) - return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) + return super()._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) def _real_extract(self, url): guid = self._match_id(url) diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index d6eb157406..694a264d67 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -129,7 +129,7 @@ class MediasiteIE(InfoExtractor): query = mobj.group('query') webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? - redirect_url = urlh.geturl() + redirect_url = compat_str(urlh.geturl()) # XXX: might have also extracted UrlReferrer and QueryString from the html service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index ad9da96125..40f214a873 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -4,8 +4,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, - parse_iso8601, smuggle_url, + parse_duration, ) @@ -18,18 +18,16 @@ class MiTeleIE(InfoExtractor): 'info_dict': { 'id': 'FhYW1iNTE6J6H7NkQRIEzfne6t2quqPg', 'ext': 'mp4', - 'title': 'Diario de La redacción Programa 144', - 'description': 'md5:07c35a7b11abb05876a6a79185b58d27', + 'title': 'Tor, la web invisible', + 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', 'series': 'Diario de', - 'season': 'Season 14', + 'season': 'La redacción', 'season_number': 14, - 'episode': 'Tor, la web invisible', + 'season_id': 'diario_de_t14_11981', + 'episode': 'Programa 144', 'episode_number': 3, 'thumbnail': r're:(?i)^https?://.*\.jpg$', 'duration': 2913, - 'age_limit': 16, - 'timestamp': 1471209401, - 'upload_date': '20160814', }, 'add_ie': ['Ooyala'], }, { @@ -41,15 +39,13 @@ class MiTeleIE(InfoExtractor): 'title': 'Cuarto Milenio Temporada 6 Programa 226', 'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f', 'series': 'Cuarto Milenio', - 'season': 'Season 6', + 'season': 'Temporada 6', 'season_number': 6, - 'episode': 'Episode 24', + 'season_id': 'cuarto_milenio_t06_12715', + 'episode': 'Programa 226', 'episode_number': 24, 'thumbnail': r're:(?i)^https?://.*\.jpg$', 'duration': 7313, - 'age_limit': 12, - 'timestamp': 1471209021, - 'upload_date': '20160814', }, 'params': { 'skip_download': True, @@ -58,36 +54,67 @@ class MiTeleIE(InfoExtractor): }, { 'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player', 'only_matching': True, - }, { - 'url': 'https://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144-40_1006364575251/player/', - 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - pre_player = self._parse_json(self._search_regex( - r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})', - webpage, 'Pre Player'), display_id)['prePlayer'] - title = pre_player['title'] - video = pre_player['video'] - video_id = video['dataMediaId'] - content = pre_player.get('content') or {} - info = content.get('info') or {} + video_id = self._match_id(url) + + paths = self._download_json( + 'https://www.mitele.es/amd/agp/web/metadata/general_configuration', + video_id, 'Downloading paths JSON') + + ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search'] + base_url = ooyala_s.get('base_url', 'cdn-search-mediaset.carbyne.ps.ooyala.com') + full_path = ooyala_s.get('full_path', '/search/v1/full/providers/') + source = self._download_json( + '%s://%s%s%s/docs/%s' % ( + ooyala_s.get('protocol', 'https'), base_url, full_path, + ooyala_s.get('provider_id', '104951'), video_id), + video_id, 'Downloading data JSON', query={ + 'include_titles': 'Series,Season', + 'product_name': ooyala_s.get('product_name', 'test'), + 'format': 'full', + })['hits']['hits'][0]['_source'] + + embedCode = source['offers'][0]['embed_codes'][0] + titles = source['localizable_titles'][0] + + title = titles.get('title_medium') or titles['title_long'] + + description = titles.get('summary_long') or titles.get('summary_medium') + + def get(key1, key2): + value1 = source.get(key1) + if not value1 or not isinstance(value1, list): + return + if not isinstance(value1[0], dict): + return + return value1[0].get(key2) + + series = get('localizable_titles_series', 'title_medium') + + season = get('localizable_titles_season', 'title_medium') + season_number = int_or_none(source.get('season_number')) + season_id = source.get('season_id') + + episode = titles.get('title_sort_name') + episode_number = int_or_none(source.get('episode_number')) + + duration = parse_duration(get('videos', 'duration')) return { '_type': 'url_transparent', # for some reason only HLS is supported - 'url': smuggle_url('ooyala:' + video_id, {'supportedformats': 'm3u8,dash'}), + 'url': smuggle_url('ooyala:' + embedCode, {'supportedformats': 'm3u8,dash'}), 'id': video_id, 'title': title, - 'description': info.get('synopsis'), - 'series': content.get('title'), - 'season_number': int_or_none(info.get('season_number')), - 'episode': content.get('subtitle'), - 'episode_number': int_or_none(info.get('episode_number')), - 'duration': int_or_none(info.get('duration')), - 'thumbnail': video.get('dataPoster'), - 'age_limit': int_or_none(info.get('rating')), - 'timestamp': parse_iso8601(pre_player.get('publishedTime')), + 'description': description, + 'series': series, + 'season': season, + 'season_number': season_number, + 'season_id': season_id, + 'episode': episode, + 'episode_number': episode_number, + 'duration': duration, + 'thumbnail': get('images', 'url'), } diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index 5234cac026..1c652813ad 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -1,8 +1,5 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor from ..utils import ( int_or_none, str_to_int, @@ -57,23 +54,3 @@ class MofosexIE(KeezMoviesIE): }) return info - - -class MofosexEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P\d+)' - _TESTS = [{ - 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r']+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)', - webpage) - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result( - 'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id), - ie=MofosexIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index b1615b4d8e..43fd70f112 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -26,7 +26,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], 'upload_date': '20100913', 'uploader_id': 'famouslyfuckedup', - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'age_limit': 18, } }, { @@ -40,7 +40,7 @@ class MotherlessIE(InfoExtractor): 'game', 'hairy'], 'upload_date': '20140622', 'uploader_id': 'Sulivana7x', - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'age_limit': 18, }, 'skip': '404', @@ -54,7 +54,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['superheroine heroine superher'], 'upload_date': '20140827', 'uploader_id': 'shade0230', - 'thumbnail': r're:https?://.*\.jpg', + 'thumbnail': r're:http://.*\.jpg', 'age_limit': 18, } }, { @@ -76,8 +76,7 @@ class MotherlessIE(InfoExtractor): raise ExtractorError('Video %s is for friends only' % video_id, expected=True) title = self._html_search_regex( - (r'(?s)]+\bclass=["\']media-meta-title[^>]+>(.+?)', - r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title') + r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') video_url = (self._html_search_regex( (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', r'fileurl\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), @@ -85,15 +84,14 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Views<', r'Views\s+([^<]+)<'), + r'Views\s+([^<]+)<', webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Favorites<', r'Favorited\s+([^<]+)<'), + r'Favorited\s+([^<]+)<', webpage, 'like count', fatal=False)) upload_date = self._html_search_regex( - (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', - r'Uploaded\s+([^<]+)<'), webpage, 'upload date') + r'Uploaded\s+([^<]+)<', webpage, 'upload date') if 'Ago' in upload_date: days = int(re.search(r'([0-9]+)', upload_date).group(1)) upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 61fc59126f..bb3d944133 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,33 +1,68 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - clean_html, - dict_get, ExtractorError, int_or_none, - parse_duration, - try_get, update_url_query, ) -class NaverBaseIE(InfoExtractor): - _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' +class NaverIE(InfoExtractor): + _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/v/(?P\d+)' - def _extract_video_info(self, video_id, vid, key): + _TESTS = [{ + 'url': 'http://tv.naver.com/v/81652', + 'info_dict': { + 'id': '81652', + 'ext': 'mp4', + 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', + 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', + 'upload_date': '20130903', + }, + }, { + 'url': 'http://tv.naver.com/v/395837', + 'md5': '638ed4c12012c458fefcddfd01f173cd', + 'info_dict': { + 'id': '395837', + 'ext': 'mp4', + 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', + 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7', + 'upload_date': '20150519', + }, + 'skip': 'Georestricted', + }, { + 'url': 'http://tvcast.naver.com/v/81652', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + vid = self._search_regex( + r'videoId["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'video id', fatal=None, group='value') + in_key = self._search_regex( + r'inKey["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'key', default=None, group='value') + + if not vid or not in_key: + error = self._html_search_regex( + r'(?s)
    \s*(?:)?\s*

    (?P.+?)

    \s*
    ', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + raise ExtractorError('couldn\'t extract vid and key') video_data = self._download_json( 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid, video_id, query={ - 'key': key, + 'key': in_key, }) meta = video_data['meta'] title = meta['subject'] formats = [] - get_list = lambda x: try_get(video_data, lambda y: y[x + 's']['list'], list) or [] def extract_formats(streams, stream_type, query={}): for stream in streams: @@ -38,7 +73,7 @@ class NaverBaseIE(InfoExtractor): encoding_option = stream.get('encodingOption', {}) bitrate = stream.get('bitrate', {}) formats.append({ - 'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))), + 'format_id': '%s_%s' % (stream.get('type') or stream_type, encoding_option.get('id') or encoding_option.get('name')), 'url': stream_url, 'width': int_or_none(encoding_option.get('width')), 'height': int_or_none(encoding_option.get('height')), @@ -48,7 +83,7 @@ class NaverBaseIE(InfoExtractor): 'protocol': 'm3u8_native' if stream_type == 'HLS' else None, }) - extract_formats(get_list('video'), 'H264') + extract_formats(video_data.get('videos', {}).get('list', []), 'H264') for stream_set in video_data.get('streams', []): query = {} for param in stream_set.get('keys', []): @@ -66,101 +101,28 @@ class NaverBaseIE(InfoExtractor): 'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False)) self._sort_formats(formats) - replace_ext = lambda x, y: re.sub(self._CAPTION_EXT_RE, '.' + y, x) - - def get_subs(caption_url): - if re.search(self._CAPTION_EXT_RE, caption_url): - return [{ - 'url': replace_ext(caption_url, 'ttml'), - }, { - 'url': replace_ext(caption_url, 'vtt'), - }] - else: - return [{'url': caption_url}] - - automatic_captions = {} subtitles = {} - for caption in get_list('caption'): + for caption in video_data.get('captions', {}).get('list', []): caption_url = caption.get('source') if not caption_url: continue - sub_dict = automatic_captions if caption.get('type') == 'auto' else subtitles - sub_dict.setdefault(dict_get(caption, ('locale', 'language')), []).extend(get_subs(caption_url)) + subtitles.setdefault(caption.get('language') or caption.get('locale'), []).append({ + 'url': caption_url, + }) - user = meta.get('user', {}) + upload_date = self._search_regex( + r']+class="date".*?(\d{4}\.\d{2}\.\d{2})', + webpage, 'upload date', fatal=False) + if upload_date: + upload_date = upload_date.replace('.', '') return { 'id': video_id, 'title': title, 'formats': formats, 'subtitles': subtitles, - 'automatic_captions': automatic_captions, - 'thumbnail': try_get(meta, lambda x: x['cover']['source']), + 'description': self._og_search_description(webpage), + 'thumbnail': meta.get('cover', {}).get('source') or self._og_search_thumbnail(webpage), 'view_count': int_or_none(meta.get('count')), - 'uploader_id': user.get('id'), - 'uploader': user.get('name'), - 'uploader_url': user.get('url'), + 'upload_date': upload_date, } - - -class NaverIE(NaverBaseIE): - _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/(?:v|embed)/(?P\d+)' - _GEO_BYPASS = False - _TESTS = [{ - 'url': 'http://tv.naver.com/v/81652', - 'info_dict': { - 'id': '81652', - 'ext': 'mp4', - 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', - 'description': '메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', - 'timestamp': 1378200754, - 'upload_date': '20130903', - 'uploader': '메가스터디, 합격불변의 법칙', - 'uploader_id': 'megastudy', - }, - }, { - 'url': 'http://tv.naver.com/v/395837', - 'md5': '8a38e35354d26a17f73f4e90094febd3', - 'info_dict': { - 'id': '395837', - 'ext': 'mp4', - 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', - 'description': 'md5:eb6aca9d457b922e43860a2a2b1984d3', - 'timestamp': 1432030253, - 'upload_date': '20150519', - 'uploader': '4가지쇼 시즌2', - 'uploader_id': 'wrappinguser29', - }, - 'skip': 'Georestricted', - }, { - 'url': 'http://tvcast.naver.com/v/81652', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - content = self._download_json( - 'https://tv.naver.com/api/json/v/' + video_id, - video_id, headers=self.geo_verification_headers()) - player_info_json = content.get('playerInfoJson') or {} - current_clip = player_info_json.get('currentClip') or {} - - vid = current_clip.get('videoId') - in_key = current_clip.get('inKey') - - if not vid or not in_key: - player_auth = try_get(player_info_json, lambda x: x['playerOption']['auth']) - if player_auth == 'notCountry': - self.raise_geo_restricted(countries=['KR']) - elif player_auth == 'notLogin': - self.raise_login_required() - raise ExtractorError('couldn\'t extract vid and key') - info = self._extract_video_info(video_id, vid, in_key) - info.update({ - 'description': clean_html(current_clip.get('description')), - 'timestamp': int_or_none(current_clip.get('firstExposureTime'), 1000), - 'duration': parse_duration(current_clip.get('displayPlayTime')), - 'like_count': int_or_none(current_clip.get('recommendPoint')), - 'age_limit': 19 if current_clip.get('adult') else None, - }) - return info diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 6f3cb30034..5bc39d0024 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -87,25 +87,11 @@ class NBCIE(AdobePassIE): def _real_extract(self, url): permalink, video_id = re.match(self._VALID_URL, url).groups() permalink = 'http' + compat_urllib_parse_unquote(permalink) - video_data = self._download_json( + response = self._download_json( 'https://friendship.nbc.co/v2/graphql', video_id, query={ - 'query': '''query bonanzaPage( - $app: NBCUBrands! = nbc - $name: String! - $oneApp: Boolean - $platform: SupportedPlatforms! = web - $type: EntityPageType! = VIDEO - $userId: String! -) { - bonanzaPage( - app: $app - name: $name - oneApp: $oneApp - platform: $platform - type: $type - userId: $userId - ) { - metadata { + 'query': '''{ + page(name: "%s", platform: web, type: VIDEO, userId: "0") { + data { ... on VideoPageData { description episodeNumber @@ -114,20 +100,15 @@ class NBCIE(AdobePassIE): mpxAccountId mpxGuid rating - resourceId seasonNumber secondaryTitle seriesShortTitle } } } -}''', - 'variables': json.dumps({ - 'name': permalink, - 'oneApp': True, - 'userId': '0', - }), - })['data']['bonanzaPage']['metadata'] +}''' % permalink, + }) + video_data = response['data']['page']['data'] query = { 'mbr': 'true', 'manifest': 'm3u', @@ -136,8 +117,8 @@ class NBCIE(AdobePassIE): title = video_data['secondaryTitle'] if video_data.get('locked'): resource = self._get_mvpd_resource( - video_data.get('resourceId') or 'nbcentertainment', - title, video_id, video_data.get('rating')) + 'nbcentertainment', title, video_id, + video_data.get('rating')) query['auth'] = self._extract_mvpd_auth( url, video_id, 'nbcentertainment', resource) theplatform_url = smuggle_url(update_url_query( diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 2447c812e0..aec2ea1331 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -7,11 +7,8 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, - merge_dicts, parse_iso8601, qualities, - try_get, - urljoin, ) @@ -88,25 +85,21 @@ class NDRIE(NDRBaseIE): def _extract_embed(self, webpage, display_id): embed_url = self._html_search_meta( - 'embedURL', webpage, 'embed URL', - default=None) or self._search_regex( - r'\bembedUrl["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'embed URL', group='url') + 'embedURL', webpage, 'embed URL', fatal=True) description = self._search_regex( r']+itemprop="description">([^<]+)

    ', webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( r']+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', - webpage, 'upload date', default=None)) - info = self._search_json_ld(webpage, display_id, default={}) - return merge_dicts({ + webpage, 'upload date', fatal=False)) + return { '_type': 'url_transparent', 'url': embed_url, 'display_id': display_id, 'description': description, 'timestamp': timestamp, - }, info) + } class NJoyIE(NDRBaseIE): @@ -227,17 +220,11 @@ class NDREmbedBaseIE(InfoExtractor): upload_date = ppjson.get('config', {}).get('publicationDate') duration = int_or_none(config.get('duration')) - thumbnails = [] - poster = try_get(config, lambda x: x['poster'], dict) or {} - for thumbnail_id, thumbnail in poster.items(): - thumbnail_url = urljoin(url, thumbnail.get('src')) - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail.get('quality') or thumbnail_id, - 'url': thumbnail_url, - 'preference': quality_key(thumbnail.get('quality')), - }) + thumbnails = [{ + 'id': thumbnail.get('quality') or thumbnail_id, + 'url': thumbnail['src'], + 'preference': quality_key(thumbnail.get('quality')), + } for thumbnail_id, thumbnail in config.get('poster', {}).items() if thumbnail.get('src')] return { 'id': video_id, diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index de6a707c42..6a2c6cb7bb 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand/(?Pvideo|audio)/(?P\d{7}|[^/]+?-\d{8}-\d+)' + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand/(?Pvideo|audio)/(?P\d{7}|[a-z]+-\d{8}-\d+)' # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ @@ -30,11 +30,8 @@ class NhkVodIE(InfoExtractor): }, { 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', 'only_matching': True, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', - 'only_matching': True, }] - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json' + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7/episode/%s/%s/all%s.json' def _real_extract(self, url): lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() @@ -85,9 +82,15 @@ class NhkVodIE(InfoExtractor): audio = episode['audio'] audio_path = audio['audio'] info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) + 'https://nhks-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', m3u8_id='hls', fatal=False) + for proto in ('rtmpt', 'rtmp'): + info['formats'].append({ + 'ext': 'flv', + 'format_id': proto, + 'url': '%s://flv.nhk.or.jp/ondemand/mp4:flv%s' % (proto, audio_path), + 'vcodec': 'none', + }) for f in info['formats']: f['language'] = lang return info diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 47b9748f02..901f44b54f 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, - determine_ext, int_or_none, js_to_json, qualities, @@ -19,7 +18,7 @@ class NovaEmbedIE(InfoExtractor): _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P[^/?#&]+)' _TEST = { 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', - 'md5': 'ee009bafcc794541570edd44b71cbea3', + 'md5': 'b3834f6de5401baabf31ed57456463f7', 'info_dict': { 'id': '8o0n0r', 'ext': 'mp4', @@ -34,76 +33,36 @@ class NovaEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - duration = None - formats = [] - - player = self._parse_json( + bitrates = self._parse_json( self._search_regex( - r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;', - webpage, 'player', default='{}'), video_id, fatal=False) - if player: - for format_id, format_list in player['tracks'].items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_dict in format_list: - if not isinstance(format_dict, dict): - continue - format_url = url_or_none(format_dict.get('src')) - format_type = format_dict.get('type') - ext = determine_ext(format_url) - if (format_type == 'application/x-mpegURL' - or format_id == 'HLS' or ext == 'm3u8'): - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - elif (format_type == 'application/dash+xml' - or format_id == 'DASH' or ext == 'mpd'): - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': format_url, + r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), + video_id, transform_source=js_to_json) + + QUALITIES = ('lq', 'mq', 'hq', 'hd') + quality_key = qualities(QUALITIES) + + formats = [] + for format_id, format_list in bitrates.items(): + if not isinstance(format_list, list): + continue + for format_url in format_list: + format_url = url_or_none(format_url) + if not format_url: + continue + f = { + 'url': format_url, + } + f_id = format_id + for quality in QUALITIES: + if '%s.mp4' % quality in format_url: + f_id += '-%s' % quality + f.update({ + 'quality': quality_key(quality), + 'format_note': quality.upper(), }) - duration = int_or_none(player.get('duration')) - else: - # Old path, not actual as of 08.04.2020 - bitrates = self._parse_json( - self._search_regex( - r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), - video_id, transform_source=js_to_json) - - QUALITIES = ('lq', 'mq', 'hq', 'hd') - quality_key = qualities(QUALITIES) - - for format_id, format_list in bitrates.items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_url in format_list: - format_url = url_or_none(format_url) - if not format_url: - continue - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - f = { - 'url': format_url, - } - f_id = format_id - for quality in QUALITIES: - if '%s.mp4' % quality in format_url: - f_id += '-%s' % quality - f.update({ - 'quality': quality_key(quality), - 'format_note': quality.upper(), - }) - break - f['format_id'] = f_id - formats.append(f) - + break + f['format_id'] = f_id + formats.append(f) self._sort_formats(formats) title = self._og_search_title( @@ -116,8 +75,7 @@ class NovaEmbedIE(InfoExtractor): r'poster\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', fatal=False, group='value') duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', - default=duration)) + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) return { 'id': video_id, @@ -133,7 +91,7 @@ class NovaIE(InfoExtractor): _VALID_URL = r'https?://(?:[^.]+\.)?(?Ptv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', - 'md5': '249baab7d0104e186e78b0899c7d5f28', + 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', 'info_dict': { 'id': '1757139', 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', @@ -155,8 +113,7 @@ class NovaIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - }, - 'skip': 'gone', + } }, { # media.cms.nova.cz embed 'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil', @@ -171,7 +128,6 @@ class NovaIE(InfoExtractor): 'skip_download': True, }, 'add_ie': [NovaEmbedIE.ie_key()], - 'skip': 'CHYBA 404: STRÁNKA NENALEZENA', }, { 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', 'only_matching': True, @@ -196,29 +152,14 @@ class NovaIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - description = clean_html(self._og_search_description(webpage, default=None)) - if site == 'novaplus': - upload_date = unified_strdate(self._search_regex( - r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) - elif site == 'fanda': - upload_date = unified_strdate(self._search_regex( - r'(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) - else: - upload_date = None - # novaplus embed_id = self._search_regex( r']+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)', webpage, 'embed url', default=None) if embed_id: - return { - '_type': 'url_transparent', - 'url': 'https://media.cms.nova.cz/embed/%s' % embed_id, - 'ie_key': NovaEmbedIE.ie_key(), - 'id': embed_id, - 'description': description, - 'upload_date': upload_date - } + return self.url_result( + 'https://media.cms.nova.cz/embed/%s' % embed_id, + ie=NovaEmbedIE.ie_key(), video_id=embed_id) video_id = self._search_regex( [r"(?:media|video_id)\s*:\s*'(\d+)'", @@ -292,8 +233,18 @@ class NovaIE(InfoExtractor): self._sort_formats(formats) title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) + description = clean_html(self._og_search_description(webpage, default=None)) thumbnail = config.get('poster') + if site == 'novaplus': + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) + elif site == 'fanda': + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) + else: + upload_date = None + return { 'id': video_id, 'display_id': display_id, diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 53acc6e574..a5e8baa7e2 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -4,7 +4,6 @@ from .common import InfoExtractor from ..utils import ( int_or_none, qualities, - url_or_none, ) @@ -49,10 +48,6 @@ class NprIE(InfoExtractor): }, }], 'expected_warnings': ['Failed to download m3u8 information'], - }, { - # multimedia, no formats, stream - 'url': 'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert', - 'only_matching': True, }] def _real_extract(self, url): @@ -100,17 +95,6 @@ class NprIE(InfoExtractor): 'format_id': format_id, 'quality': quality(format_id), }) - for stream_id, stream_entry in media.get('stream', {}).items(): - if not isinstance(stream_entry, dict): - continue - if stream_id != 'hlsUrl': - continue - stream_url = url_or_none(stream_entry.get('$text')) - if not stream_url: - continue - formats.extend(self._extract_m3u8_formats( - stream_url, stream_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) self._sort_formats(formats) entries.append({ diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 94115534b7..60933f069c 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -12,7 +12,6 @@ from ..utils import ( ExtractorError, int_or_none, JSON_LD_RE, - js_to_json, NO_DEFAULT, parse_age_limit, parse_duration, @@ -106,7 +105,6 @@ class NRKBaseIE(InfoExtractor): MESSAGES = { 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 'Ikke tilgjengelig', 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', } message_type = data.get('messageType', '') @@ -257,17 +255,6 @@ class NRKTVIE(NRKBaseIE): ''' % _EPISODE_RE _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ - 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', - 'info_dict': { - 'id': 'MDDP12000117AA', - 'ext': 'mp4', - 'title': 'Alarm Trolltunga', - 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, - 'age_limit': 6, - }, - }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '9a167e54d04671eb6317a37b7bc8a280', 'info_dict': { @@ -279,7 +266,6 @@ class NRKTVIE(NRKBaseIE): 'series': '20 spørsmål', 'episode': '23.05.2014', }, - 'skip': 'NoProgramRights', }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { @@ -384,24 +370,7 @@ class NRKTVIE(NRKBaseIE): class NRKTVEpisodeIE(InfoExtractor): _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/\d+/episode/\d+)' - _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', - 'info_dict': { - 'id': 'MUHH36005220BA', - 'ext': 'mp4', - 'title': 'Kro, krig og kjærlighet 2:6', - 'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350', - 'duration': 1563, - 'series': 'Hellums kro', - 'season_number': 1, - 'episode_number': 2, - 'episode': '2:6', - 'age_limit': 6, - }, - 'params': { - 'skip_download': True, - }, - }, { + _TEST = { 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', 'info_dict': { 'id': 'MSUI14000816AA', @@ -417,8 +386,7 @@ class NRKTVEpisodeIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'skip': 'ProgramRightsHasExpired', - }] + } def _real_extract(self, url): display_id = self._match_id(url) @@ -441,7 +409,7 @@ class NRKTVSerieBaseIE(InfoExtractor): (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', r'({.+?})\s*,\s*"[^"]+"\s*\)\s*'), webpage, 'config', default='{}' if not fatal else NO_DEFAULT), - display_id, fatal=False, transform_source=js_to_json) + display_id, fatal=False) if not config: return return try_get( @@ -511,14 +479,6 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/blank', - 'info_dict': { - 'id': 'blank', - 'title': 'Blank', - 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', - }, - 'playlist_mincount': 30, - }, { # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', 'info_dict': { @@ -688,7 +648,7 @@ class NRKSkoleIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.nrk.no/skole/?page=search&q=&mediaId=14099', - 'md5': '18c12c3d071953c3bf8d54ef6b2587b7', + 'md5': '6bc936b01f9dd8ed45bc58b252b2d9b6', 'info_dict': { 'id': '6021', 'ext': 'mp4', diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index fc78ca56c9..2bb77ab249 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -69,10 +69,10 @@ class NYTimesBaseIE(InfoExtractor): 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), 'filesize': get_file_size(video.get('file_size') or video.get('fileSize')), - 'tbr': int_or_none(video.get('bitrate'), 1000) or None, + 'tbr': int_or_none(video.get('bitrate'), 1000), 'ext': ext, }) - self._sort_formats(formats, ('height', 'width', 'filesize', 'tbr', 'fps', 'format_id')) + self._sort_formats(formats) thumbnails = [] for image in video_data.get('images', []): diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 700ce448c4..3425f76024 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -6,14 +6,12 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - clean_html, determine_ext, float_or_none, HEADRequest, int_or_none, orderedSet, remove_end, - str_or_none, strip_jsonp, unescapeHTML, unified_strdate, @@ -90,11 +88,8 @@ class ORFTVthekIE(InfoExtractor): format_id = '-'.join(format_id_list) ext = determine_ext(src) if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id=format_id, fatal=False) - if any('/geoprotection' in f['url'] for f in m3u8_formats): - self.raise_geo_restricted() - formats.extend(m3u8_formats) + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', m3u8_id=format_id, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( src, video_id, f4m_id=format_id, fatal=False)) @@ -162,53 +157,48 @@ class ORFTVthekIE(InfoExtractor): class ORFRadioIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + station = mobj.group('station') show_date = mobj.group('date') show_id = mobj.group('show') - data = self._download_json( - 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' - % (self._API_STATION, show_id, show_date), show_id) + if station == 'fm4': + show_id = '4%s' % show_id - entries = [] - for info in data['streams']: - loop_stream_id = str_or_none(info.get('loopStreamId')) - if not loop_stream_id: - continue - title = str_or_none(data.get('title')) - if not title: - continue - start = int_or_none(info.get('start'), scale=1000) - end = int_or_none(info.get('end'), scale=1000) - duration = end - start if end and start else None - entries.append({ - 'id': loop_stream_id.replace('.mp3', ''), - 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id), + data = self._download_json( + 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date), + show_id + ) + + def extract_entry_dict(info, title, subtitle): + return { + 'id': info['loopStreamId'].replace('.mp3', ''), + 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']), 'title': title, - 'description': clean_html(data.get('subtitle')), - 'duration': duration, - 'timestamp': start, + 'description': subtitle, + 'duration': (info['end'] - info['start']) / 1000, + 'timestamp': info['start'] / 1000, 'ext': 'mp3', - 'series': data.get('programTitle'), - }) + 'series': data.get('programTitle') + } + + entries = [extract_entry_dict(t, data['title'], data['subtitle']) for t in data['streams']] return { '_type': 'playlist', 'id': show_id, - 'title': data.get('title'), - 'description': clean_html(data.get('subtitle')), - 'entries': entries, + 'title': data['title'], + 'description': data['subtitle'], + 'entries': entries } class ORFFM4IE(ORFRadioIE): IE_NAME = 'orf:fm4' IE_DESC = 'radio FM4' - _VALID_URL = r'https?://(?Pfm4)\.orf\.at/player/(?P[0-9]+)/(?P4\w+)' - _API_STATION = 'fm4' - _LOOP_STATION = 'fm4' + _VALID_URL = r'https?://(?Pfm4)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' _TEST = { - 'url': 'http://fm4.orf.at/player/20170107/4CC', + 'url': 'http://fm4.orf.at/player/20170107/CC', 'md5': '2b0be47375432a7ef104453432a19212', 'info_dict': { 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295', @@ -219,138 +209,7 @@ class ORFFM4IE(ORFRadioIE): 'timestamp': 1483819257, 'upload_date': '20170107', }, - 'skip': 'Shows from ORF radios are only available for 7 days.', - 'only_matching': True, - } - - -class ORFNOEIE(ORFRadioIE): - IE_NAME = 'orf:noe' - IE_DESC = 'Radio Niederösterreich' - _VALID_URL = r'https?://(?Pnoe)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'noe' - _LOOP_STATION = 'oe2n' - - _TEST = { - 'url': 'https://noe.orf.at/player/20200423/NGM', - 'only_matching': True, - } - - -class ORFWIEIE(ORFRadioIE): - IE_NAME = 'orf:wien' - IE_DESC = 'Radio Wien' - _VALID_URL = r'https?://(?Pwien)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'wie' - _LOOP_STATION = 'oe2w' - - _TEST = { - 'url': 'https://wien.orf.at/player/20200423/WGUM', - 'only_matching': True, - } - - -class ORFBGLIE(ORFRadioIE): - IE_NAME = 'orf:burgenland' - IE_DESC = 'Radio Burgenland' - _VALID_URL = r'https?://(?Pburgenland)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'bgl' - _LOOP_STATION = 'oe2b' - - _TEST = { - 'url': 'https://burgenland.orf.at/player/20200423/BGM', - 'only_matching': True, - } - - -class ORFOOEIE(ORFRadioIE): - IE_NAME = 'orf:oberoesterreich' - IE_DESC = 'Radio Oberösterreich' - _VALID_URL = r'https?://(?Pooe)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'ooe' - _LOOP_STATION = 'oe2o' - - _TEST = { - 'url': 'https://ooe.orf.at/player/20200423/OGMO', - 'only_matching': True, - } - - -class ORFSTMIE(ORFRadioIE): - IE_NAME = 'orf:steiermark' - IE_DESC = 'Radio Steiermark' - _VALID_URL = r'https?://(?Psteiermark)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'stm' - _LOOP_STATION = 'oe2st' - - _TEST = { - 'url': 'https://steiermark.orf.at/player/20200423/STGMS', - 'only_matching': True, - } - - -class ORFKTNIE(ORFRadioIE): - IE_NAME = 'orf:kaernten' - IE_DESC = 'Radio Kärnten' - _VALID_URL = r'https?://(?Pkaernten)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'ktn' - _LOOP_STATION = 'oe2k' - - _TEST = { - 'url': 'https://kaernten.orf.at/player/20200423/KGUMO', - 'only_matching': True, - } - - -class ORFSBGIE(ORFRadioIE): - IE_NAME = 'orf:salzburg' - IE_DESC = 'Radio Salzburg' - _VALID_URL = r'https?://(?Psalzburg)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'sbg' - _LOOP_STATION = 'oe2s' - - _TEST = { - 'url': 'https://salzburg.orf.at/player/20200423/SGUM', - 'only_matching': True, - } - - -class ORFTIRIE(ORFRadioIE): - IE_NAME = 'orf:tirol' - IE_DESC = 'Radio Tirol' - _VALID_URL = r'https?://(?Ptirol)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'tir' - _LOOP_STATION = 'oe2t' - - _TEST = { - 'url': 'https://tirol.orf.at/player/20200423/TGUMO', - 'only_matching': True, - } - - -class ORFVBGIE(ORFRadioIE): - IE_NAME = 'orf:vorarlberg' - IE_DESC = 'Radio Vorarlberg' - _VALID_URL = r'https?://(?Pvorarlberg)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'vbg' - _LOOP_STATION = 'oe2v' - - _TEST = { - 'url': 'https://vorarlberg.orf.at/player/20200423/VGUM', - 'only_matching': True, - } - - -class ORFOE3IE(ORFRadioIE): - IE_NAME = 'orf:oe3' - IE_DESC = 'Radio Österreich 3' - _VALID_URL = r'https?://(?Poe3)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'oe3' - _LOOP_STATION = 'oe3' - - _TEST = { - 'url': 'https://oe3.orf.at/player/20200424/3WEK', - 'only_matching': True, + 'skip': 'Shows from ORF radios are only available for 7 days.' } @@ -358,8 +217,6 @@ class ORFOE1IE(ORFRadioIE): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' _VALID_URL = r'https?://(?Poe1)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' - _API_STATION = 'oe1' - _LOOP_STATION = 'oe1' _TEST = { 'url': 'http://oe1.orf.at/player/20170108/456544', diff --git a/youtube_dl/extractor/pandatv.py b/youtube_dl/extractor/pandatv.py new file mode 100644 index 0000000000..4219802d52 --- /dev/null +++ b/youtube_dl/extractor/pandatv.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + qualities, +) + + +class PandaTVIE(InfoExtractor): + IE_DESC = '熊猫TV' + _VALID_URL = r'https?://(?:www\.)?panda\.tv/(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://www.panda.tv/66666', + 'info_dict': { + 'id': '66666', + 'title': 're:.+', + 'uploader': '刘杀鸡', + 'ext': 'flv', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Live stream is offline', + }, { + 'url': 'https://www.panda.tv/66666', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + config = self._download_json( + 'https://www.panda.tv/api_room_v2?roomid=%s' % video_id, video_id) + + error_code = config.get('errno', 0) + if error_code != 0: + raise ExtractorError( + '%s returned error %s: %s' + % (self.IE_NAME, error_code, config['errmsg']), + expected=True) + + data = config['data'] + video_info = data['videoinfo'] + + # 2 = live, 3 = offline + if video_info.get('status') != '2': + raise ExtractorError( + 'Live stream is offline', expected=True) + + title = data['roominfo']['name'] + uploader = data.get('hostinfo', {}).get('name') + room_key = video_info['room_key'] + stream_addr = video_info.get( + 'stream_addr', {'OD': '1', 'HD': '1', 'SD': '1'}) + + # Reverse engineered from web player swf + # (http://s6.pdim.gs/static/07153e425f581151.swf at the moment of + # writing). + plflag0, plflag1 = video_info['plflag'].split('_') + plflag0 = int(plflag0) - 1 + if plflag1 == '21': + plflag0 = 10 + plflag1 = '4' + live_panda = 'live_panda' if plflag0 < 1 else '' + + plflag_auth = self._parse_json(video_info['plflag_list'], video_id) + sign = plflag_auth['auth']['sign'] + ts = plflag_auth['auth']['time'] + rid = plflag_auth['auth']['rid'] + + quality_key = qualities(['OD', 'HD', 'SD']) + suffix = ['_small', '_mid', ''] + formats = [] + for k, v in stream_addr.items(): + if v != '1': + continue + quality = quality_key(k) + if quality <= 0: + continue + for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))): + formats.append({ + 'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s?sign=%s&ts=%s&rid=%s' + % (pl, plflag1, room_key, live_panda, suffix[quality], ext, sign, ts, rid), + 'format_id': '%s-%s' % (k, ext), + 'quality': quality, + 'source_preference': pref, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title), + 'uploader': uploader, + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 48fb954169..d3a83ea2bb 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -8,7 +8,6 @@ from ..compat import compat_str from ..utils import ( int_or_none, parse_resolution, - str_or_none, try_get, unified_timestamp, url_or_none, @@ -416,7 +415,6 @@ class PeerTubeIE(InfoExtractor): peertube\.cpy\.re )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' - _API_BASE = 'https://%s/api/v1/videos/%s/%s' _VALID_URL = r'''(?x) (?: peertube:(?P[^:]+):| @@ -425,30 +423,26 @@ class PeerTubeIE(InfoExtractor): (?P%s) ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ - 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', - 'md5': '9bed8c0137913e17b86334e5885aacff', + 'url': 'https://peertube.cpy.re/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'md5': '80f24ff364cc9d333529506a263e7feb', 'info_dict': { - 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', 'ext': 'mp4', - 'title': 'What is PeerTube?', - 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', + 'title': 'wow', + 'description': 'wow such video, so gif', 'thumbnail': r're:https?://.*\.(?:jpg|png)', - 'timestamp': 1538391166, - 'upload_date': '20181001', - 'uploader': 'Framasoft', - 'uploader_id': '3', - 'uploader_url': 'https://framatube.org/accounts/framasoft', - 'channel': 'Les vidéos de Framasoft', - 'channel_id': '2', - 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', - 'language': 'en', - 'license': 'Attribution - Share Alike', - 'duration': 113, + 'timestamp': 1519297480, + 'upload_date': '20180222', + 'uploader': 'Luclu7', + 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', + 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', + 'license': 'Unknown', + 'duration': 3, 'view_count': int, 'like_count': int, 'dislike_count': int, - 'tags': ['framasoft', 'peertube'], - 'categories': ['Science & Technology'], + 'tags': list, + 'categories': list, } }, { 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', @@ -490,38 +484,13 @@ class PeerTubeIE(InfoExtractor): entries = [peertube_url] return entries - def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): - return self._download_json( - self._API_BASE % (host, video_id, path), video_id, - note=note, errnote=errnote, fatal=fatal) - - def _get_subtitles(self, host, video_id): - captions = self._call_api( - host, video_id, 'captions', note='Downloading captions JSON', - fatal=False) - if not isinstance(captions, dict): - return - data = captions.get('data') - if not isinstance(data, list): - return - subtitles = {} - for e in data: - language_id = try_get(e, lambda x: x['language']['id'], compat_str) - caption_url = urljoin('https://%s' % host, e.get('captionPath')) - if not caption_url: - continue - subtitles.setdefault(language_id or 'en', []).append({ - 'url': caption_url, - }) - return subtitles - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') or mobj.group('host_2') video_id = mobj.group('id') - video = self._call_api( - host, video_id, '', note='Downloading video JSON') + video = self._download_json( + 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) title = video['name'] @@ -544,28 +513,10 @@ class PeerTubeIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - full_description = self._call_api( - host, video_id, 'description', note='Downloading description JSON', - fatal=False) + def account_data(field): + return try_get(video, lambda x: x['account'][field], compat_str) - description = None - if isinstance(full_description, dict): - description = str_or_none(full_description.get('description')) - if not description: - description = video.get('description') - - subtitles = self.extract_subtitles(host, video_id) - - def data(section, field, type_): - return try_get(video, lambda x: x[section][field], type_) - - def account_data(field, type_): - return data('account', field, type_) - - def channel_data(field, type_): - return data('channel', field, type_) - - category = data('category', 'label', compat_str) + category = try_get(video, lambda x: x['category']['label'], compat_str) categories = [category] if category else None nsfw = video.get('nsfw') @@ -577,17 +528,14 @@ class PeerTubeIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': description, + 'description': video.get('description'), 'thumbnail': urljoin(url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), - 'uploader': account_data('displayName', compat_str), - 'uploader_id': str_or_none(account_data('id', int)), - 'uploader_url': url_or_none(account_data('url', compat_str)), - 'channel': channel_data('displayName', compat_str), - 'channel_id': str_or_none(channel_data('id', int)), - 'channel_url': url_or_none(channel_data('url', compat_str)), - 'language': data('language', 'id', compat_str), - 'license': data('licence', 'label', compat_str), + 'uploader': account_data('displayName'), + 'uploader_id': account_data('uuid'), + 'uploder_url': account_data('url'), + 'license': try_get( + video, lambda x: x['licence']['label'], compat_str), 'duration': int_or_none(video.get('duration')), 'view_count': int_or_none(video.get('views')), 'like_count': int_or_none(video.get('likes')), @@ -596,5 +544,4 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, - 'subtitles': subtitles } diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index b15906390d..c02e34abac 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -18,7 +18,7 @@ class PeriscopeBaseIE(InfoExtractor): item_id, query=query) def _parse_broadcast_data(self, broadcast, video_id): - title = broadcast.get('status') or 'Periscope Broadcast' + title = broadcast['status'] uploader = broadcast.get('user_display_name') or broadcast.get('username') title = '%s - %s' % (uploader, title) if uploader else title is_live = broadcast.get('state').lower() == 'running' diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py index 23c8256b59..602207bebd 100644 --- a/youtube_dl/extractor/platzi.py +++ b/youtube_dl/extractor/platzi.py @@ -46,7 +46,7 @@ class PlatziBaseIE(InfoExtractor): headers={'Referer': self._LOGIN_URL}) # login succeeded - if 'platzi.com/login' not in urlh.geturl(): + if 'platzi.com/login' not in compat_str(urlh.geturl()): return login_error = self._webpage_read_content( diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py index 80222d4283..dd5f17f119 100644 --- a/youtube_dl/extractor/pokemon.py +++ b/youtube_dl/extractor/pokemon.py @@ -20,16 +20,20 @@ class PokemonIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Ol’ Raise and Switch!', 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', + 'timestamp': 1511824728, + 'upload_date': '20171127', }, 'add_id': ['LimelightMedia'], }, { # no data-video-title - 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', + 'url': 'https://www.pokemon.com/us/pokemon-episodes/pokemon-movies/pokemon-the-rise-of-darkrai-2008', 'info_dict': { - 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', + 'id': '99f3bae270bf4e5097274817239ce9c8', 'ext': 'mp4', - 'title': "Pokémon : L'ascension de Darkrai", - 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', + 'title': 'Pokémon: The Rise of Darkrai', + 'description': 'md5:ea8fbbf942e1e497d54b19025dd57d9d', + 'timestamp': 1417778347, + 'upload_date': '20141205', }, 'add_id': ['LimelightMedia'], 'params': { diff --git a/youtube_dl/extractor/popcorntimes.py b/youtube_dl/extractor/popcorntimes.py deleted file mode 100644 index 7bf7f98587..0000000000 --- a/youtube_dl/extractor/popcorntimes.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_chr, -) -from ..utils import int_or_none - - -class PopcorntimesIE(InfoExtractor): - _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P[^/]+)/(?P[^/?#&]+)' - _TEST = { - 'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy', - 'md5': '93f210991ad94ba8c3485950a2453257', - 'info_dict': { - 'id': 'A1XCFvz', - 'display_id': 'haensel-und-gretel-opera-fantasy', - 'ext': 'mp4', - 'title': 'Hänsel und Gretel', - 'description': 'md5:1b8146791726342e7b22ce8125cf6945', - 'thumbnail': r're:^https?://.*\.jpg$', - 'creator': 'John Paul', - 'release_date': '19541009', - 'duration': 4260, - 'tbr': 5380, - 'width': 720, - 'height': 540, - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') - - webpage = self._download_webpage(url, display_id) - - title = self._search_regex( - r'

    ([^<]+)', webpage, 'title', - default=None) or self._html_search_meta( - 'ya:ovs:original_name', webpage, 'title', fatal=True) - - loc = self._search_regex( - r'PCTMLOC\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'loc', - group='value') - - loc_b64 = '' - for c in loc: - c_ord = ord(c) - if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= ord('Z'): - upper = ord('Z') if c_ord <= ord('Z') else ord('z') - c_ord += 13 - if upper < c_ord: - c_ord -= 26 - loc_b64 += compat_chr(c_ord) - - video_url = compat_b64decode(loc_b64).decode('utf-8') - - description = self._html_search_regex( - r'(?s)]+class=["\']pt-movie-desc[^>]+>(.+?)', webpage, - 'description', fatal=False) - - thumbnail = self._search_regex( - r']+class=["\']video-preview[^>]+\bsrc=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'thumbnail', default=None, - group='value') or self._og_search_thumbnail(webpage) - - creator = self._html_search_meta( - 'video:director', webpage, 'creator', default=None) - - release_date = self._html_search_meta( - 'video:release_date', webpage, default=None) - if release_date: - release_date = release_date.replace('-', '') - - def int_meta(name): - return int_or_none(self._html_search_meta( - name, webpage, default=None)) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'creator': creator, - 'release_date': release_date, - 'duration': int_meta('video:duration'), - 'tbr': int_meta('ya:ovs:bitrate'), - 'width': int_meta('og:video:width'), - 'height': int_meta('og:video:height'), - 'http_headers': { - 'Referer': url, - }, - } diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index c6052ac9f9..27d65d4b9c 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -8,7 +8,6 @@ from ..utils import ( ExtractorError, int_or_none, js_to_json, - merge_dicts, urljoin, ) @@ -28,22 +27,23 @@ class PornHdIE(InfoExtractor): 'view_count': int, 'like_count': int, 'age_limit': 18, - }, - 'skip': 'HTTP Error 404: Not Found', + } }, { + # removed video 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', - 'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de', + 'md5': '956b8ca569f7f4d8ec563e2c41598441', 'info_dict': { 'id': '1962', 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'ext': 'mp4', - 'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759', + 'title': 'Sierra loves doing laundry', 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, 'like_count': int, 'age_limit': 18, }, + 'skip': 'Not available anymore', }] def _real_extract(self, url): @@ -61,13 +61,7 @@ class PornHdIE(InfoExtractor): r"(?s)sources'?\s*[:=]\s*(\{.+?\})", webpage, 'sources', default='{}')), video_id) - info = {} if not sources: - entries = self._parse_html5_media_entries(url, webpage, video_id) - if entries: - info = entries[0] - - if not sources and not info: message = self._html_search_regex( r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P.+?)]+class=["\']video-description[^>]+>(?P.+?)', - r'<(div|p)[^>]+class="description"[^>]*>(?P[^<]+)]+class="description"[^>]*>(?P[^<]+)(?:(?!\1).)+)\1", webpage, - 'thumbnail', default=None, group='url') + 'thumbnail', fatal=False, group='url') like_count = int_or_none(self._search_regex( - (r'(\d+)\s*likes', - r'(\d+)\s*]+>(?: |\s)*\blikes', + (r'(\d+)\s*]+>(?: |\s)*\blikes', r'class=["\']save-count["\'][^>]*>\s*(\d+)'), webpage, 'like count', fatal=False)) - return merge_dicts(info, { + return { 'id': video_id, 'display_id': display_id, 'title': title, @@ -118,4 +106,4 @@ class PornHdIE(InfoExtractor): 'like_count': like_count, 'formats': formats, 'age_limit': 18, - }) + } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3567a32839..ba0ad7da29 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -17,7 +17,6 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - NO_DEFAULT, orderedSet, remove_quotes, str_to_int, @@ -52,7 +51,7 @@ class PornHubIE(PornHubBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?Ppornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) @@ -149,9 +148,6 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', 'only_matching': True, - }, { - 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', - 'only_matching': True, }] @staticmethod @@ -169,13 +165,6 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) - self._set_cookie(host, 'age_verified', '1') def dl_webpage(platform): @@ -199,10 +188,10 @@ class PornHubIE(PornHubBaseIE): # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. title = self._html_search_meta( - 'twitter:title', webpage, default=None) or self._html_search_regex( - (r'(?s)]+class=["\']title["\'][^>]*>(?P.+?)</h1>', - r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', - r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), + 'twitter:title', webpage, default=None) or self._search_regex( + (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)', + r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', + r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), webpage, 'title', group='title') video_urls = [] @@ -238,13 +227,12 @@ class PornHubIE(PornHubBaseIE): else: thumbnail, duration = [None] * 2 - def extract_js_vars(webpage, pattern, default=NO_DEFAULT): - assignments = self._search_regex( - pattern, webpage, 'encoded url', default=default) - if not assignments: - return {} + if not video_urls: + tv_webpage = dl_webpage('tv') - assignments = assignments.split(';') + assignments = self._search_regex( + r'(var.+?mediastring.+?)</script>', tv_webpage, + 'encoded url').split(';') js_vars = {} @@ -266,35 +254,11 @@ class PornHubIE(PornHubBaseIE): assn = re.sub(r'var\s+', '', assn) vname, value = assn.split('=', 1) js_vars[vname] = parse_js_value(value) - return js_vars - def add_video_url(video_url): - v_url = url_or_none(video_url) - if not v_url: - return - if v_url in video_urls_set: - return - video_urls.append((v_url, None)) - video_urls_set.add(v_url) - - if not video_urls: - FORMAT_PREFIXES = ('media', 'quality') - js_vars = extract_js_vars( - webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), - default=None) - if js_vars: - for key, format_url in js_vars.items(): - if any(key.startswith(p) for p in FORMAT_PREFIXES): - add_video_url(format_url) - if not video_urls and re.search( - r'<[^>]+\bid=["\']lockedPlayer', webpage): - raise ExtractorError( - 'Video %s is locked' % video_id, expected=True) - - if not video_urls: - js_vars = extract_js_vars( - dl_webpage('tv'), r'(var.+?mediastring.+?)</script>') - add_video_url(js_vars['mediastring']) + video_url = js_vars['mediastring'] + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) for mobj in re.finditer( r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', @@ -312,16 +276,10 @@ class PornHubIE(PornHubBaseIE): r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) if upload_date: upload_date = upload_date.replace('/', '') - ext = determine_ext(video_url) - if ext == 'mpd': + if determine_ext(video_url) == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id='dash', fatal=False)) continue - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue tbr = None mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) if mobj: @@ -415,7 +373,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -483,7 +441,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -598,7 +556,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index e470882922..e19a470a5e 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -11,13 +11,12 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, - merge_dicts, unified_strdate, ) class ProSiebenSat1BaseIE(InfoExtractor): - _GEO_BYPASS = False + _GEO_COUNTRIES = ['DE'] _ACCESS_ID = None _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' @@ -40,18 +39,14 @@ class ProSiebenSat1BaseIE(InfoExtractor): formats = [] if self._ACCESS_ID: raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID - protocols = self._download_json( + server_token = (self._download_json( self._V4_BASE_URL + 'protocols', clip_id, 'Downloading protocols JSON', headers=self.geo_verification_headers(), query={ 'access_id': self._ACCESS_ID, 'client_token': sha1((raw_ct).encode()).hexdigest(), 'video_id': clip_id, - }, fatal=False, expected_status=(403,)) or {} - error = protocols.get('error') or {} - if error.get('title') == 'Geo check failed': - self.raise_geo_restricted(countries=['AT', 'CH', 'DE']) - server_token = protocols.get('server_token') + }, fatal=False) or {}).get('server_token') if server_token: urls = (self._download_json( self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ @@ -176,7 +171,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): (?: (?:beta\.)? (?: - prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia + prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|7tv|advopedia )\.(?:de|at|ch)| ran\.de|fem\.com|advopedia\.de|galileo\.tv/video ) @@ -194,14 +189,10 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'info_dict': { 'id': '2104602', 'ext': 'mp4', - 'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2', + 'title': 'Episode 18 - Staffel 2', 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 'upload_date': '20131231', 'duration': 5845.04, - 'series': 'CIRCUS HALLIGALLI', - 'season_number': 2, - 'episode': 'Episode 18 - Staffel 2', - 'episode_number': 18, }, }, { @@ -305,9 +296,8 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'info_dict': { 'id': '2572814', 'ext': 'mp4', - 'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man', + 'title': 'Andreas Kümmert: Rocket Man', 'description': 'md5:6ddb02b0781c6adf778afea606652e38', - 'timestamp': 1382041620, 'upload_date': '20131017', 'duration': 469.88, }, @@ -316,7 +306,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): }, }, { - 'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag', + 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html', 'info_dict': { 'id': '2156342', 'ext': 'mp4', @@ -338,6 +328,19 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'playlist_count': 2, 'skip': 'This video is unavailable', }, + { + 'url': 'http://www.7tv.de/circus-halligalli/615-best-of-circus-halligalli-ganze-folge', + 'info_dict': { + 'id': '4187506', + 'ext': 'mp4', + 'title': 'Best of Circus HalliGalli', + 'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9', + 'upload_date': '20151229', + }, + 'params': { + 'skip_download': True, + }, + }, { # title in <h2 class="subtitle"> 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', @@ -414,6 +417,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>', ] _UPLOAD_DATE_REGEXES = [ + r'<meta property="og:published_time" content="(.+?)">', r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"', r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr', r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>', @@ -443,21 +447,17 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): if description is None: description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate( - self._html_search_meta('og:published_time', webpage, - 'upload date', default=None) - or self._html_search_regex(self._UPLOAD_DATE_REGEXES, - webpage, 'upload date', default=None)) + upload_date = unified_strdate(self._html_search_regex( + self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) - json_ld = self._search_json_ld(webpage, clip_id, default={}) - - return merge_dicts(info, { + info.update({ 'id': clip_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, - }, json_ld) + }) + return info def _extract_playlist(self, url, webpage): playlist_id = self._html_search_regex( diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py index ca71665e0f..fb704a3c43 100644 --- a/youtube_dl/extractor/puhutv.py +++ b/youtube_dl/extractor/puhutv.py @@ -82,6 +82,17 @@ class PuhuTVIE(InfoExtractor): urls = [] formats = [] + def add_http_from_hls(m3u8_f): + http_url = m3u8_f['url'].replace('/hls/', '/mp4/').replace('/chunklist.m3u8', '.mp4') + if http_url != m3u8_f['url']: + f = m3u8_f.copy() + f.update({ + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + 'url': http_url, + }) + formats.append(f) + for video in videos['data']['videos']: media_url = url_or_none(video.get('url')) if not media_url or media_url in urls: @@ -90,9 +101,12 @@ class PuhuTVIE(InfoExtractor): playlist = video.get('is_playlist') if (video.get('stream_type') == 'hls' and playlist is True) or 'playlist.m3u8' in media_url: - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( media_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + for m3u8_f in m3u8_formats: + formats.append(m3u8_f) + add_http_from_hls(m3u8_f) continue quality = int_or_none(video.get('quality')) @@ -114,6 +128,8 @@ class PuhuTVIE(InfoExtractor): format_id += '-%sp' % quality f['format_id'] = format_id formats.append(f) + if is_hls: + add_http_from_hls(f) self._sort_formats(formats) creator = try_get( diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 2d2f6a98c9..5c84028ef9 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, int_or_none, merge_dicts, @@ -44,21 +43,14 @@ class RedTubeIE(InfoExtractor): webpage = self._download_webpage( 'http://www.redtube.com/%s' % video_id, video_id) - ERRORS = ( - (('video-deleted-info', '>This video has been removed'), 'has been removed'), - (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'), - ) - - for patterns, message in ERRORS: - if any(p in webpage for p in patterns): - raise ExtractorError( - 'Video %s %s' % (video_id, message), expected=True) + if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): + raise ExtractorError('Video %s has been removed' % video_id, expected=True) info = self._search_json_ld(webpage, video_id, default={}) if not info.get('title'): info['title'] = self._html_search_regex( - (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), webpage, 'title', group='title', default=None) or self._og_search_title(webpage) @@ -78,7 +70,7 @@ class RedTubeIE(InfoExtractor): }) medias = self._parse_json( self._search_regex( - r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage, + r'mediaDefinition\s*:\s*(\[.+?\])', webpage, 'media definitions', default='{}'), video_id, fatal=False) if medias and isinstance(medias, list): @@ -86,12 +78,6 @@ class RedTubeIE(InfoExtractor): format_url = url_or_none(media.get('videoUrl')) if not format_url: continue - if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue format_id = media.get('quality') formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 2cc6651224..bd9ee1647d 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -8,6 +8,7 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, + compat_str, compat_urlparse, ) from ..utils import ( @@ -38,13 +39,13 @@ class SafariBaseIE(InfoExtractor): 'Downloading login page') def is_logged(urlh): - return 'learning.oreilly.com/home/' in urlh.geturl() + return 'learning.oreilly.com/home/' in compat_str(urlh.geturl()) if is_logged(urlh): self.LOGGED_IN = True return - redirect_url = urlh.geturl() + redirect_url = compat_str(urlh.geturl()) parsed_url = compat_urlparse.urlparse(redirect_url) qs = compat_parse_qs(parsed_url.query) next_uri = compat_urlparse.urljoin( @@ -164,8 +165,7 @@ class SafariIE(SafariBaseIE): kaltura_session = self._download_json( '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), video_id, 'Downloading kaltura session JSON', - 'Unable to download kaltura session JSON', fatal=False, - headers={'Accept': 'application/json'}) + 'Unable to download kaltura session JSON', fatal=False) if kaltura_session: session = kaltura_session.get('session') if session: diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index b40b4c4afd..8b3275735b 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -7,7 +7,6 @@ import re from .aws import AWSIE from .anvato import AnvatoIE -from .common import InfoExtractor from ..utils import ( smuggle_url, urlencode_postdata, @@ -103,50 +102,3 @@ class ScrippsNetworksWatchIE(AWSIE): 'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id, {'geo_countries': ['US']}), AnvatoIE.ie_key(), video_id=mcp_id) - - -class ScrippsNetworksIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<site>cookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338', - 'info_dict': { - 'id': '0260338', - 'ext': 'mp4', - 'title': 'The Best of the Best', - 'description': 'Catch a new episode of MasterChef Canada Tuedsay at 9/8c.', - 'timestamp': 1475678834, - 'upload_date': '20161005', - 'uploader': 'SCNI-SCND', - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', - 'only_matching': True, - }, { - 'url': 'https://www.foodnetwork.com/videos/chocolate-strawberry-cake-roll-7524591', - 'only_matching': True, - }, { - 'url': 'https://www.hgtv.com/videos/cookie-decorating-101-0301929', - 'only_matching': True, - }, { - 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', - 'only_matching': True, - }, { - 'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368', - 'only_matching': True, - }] - _ACCOUNT_MAP = { - 'cookingchanneltv': 2433005105, - 'discovery': 2706091867, - 'diynetwork': 2433004575, - 'foodnetwork': 2433005105, - 'hgtv': 2433004575, - 'travelchannel': 2433005739, - } - _TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true' - - def _real_extract(self, url): - site, guid = re.match(self._VALID_URL, url).groups() - return self.url_result(smuggle_url( - self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid), - {'force_smil_url': True}), 'ThePlatform', guid) diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index 9401bf2cf7..e579d42cf5 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -7,18 +7,9 @@ from .common import InfoExtractor class ServusIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - servustv\.com/videos - ) - /(?P<id>[aA]{2}-\w+|\d+-\d+) - ''' + _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)' _TESTS = [{ - # new URL schema - 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', @@ -27,10 +18,6 @@ class ServusIE(InfoExtractor): 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', } - }, { - # old URL schema - 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', - 'only_matching': True, }, { 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', 'only_matching': True, diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index d37c52543f..c2ee54457e 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -9,13 +9,10 @@ from .common import ( SearchInfoExtractor ) from ..compat import ( - compat_HTTPError, - compat_kwargs, compat_str, compat_urlparse, ) from ..utils import ( - error_to_compat_str, ExtractorError, float_or_none, HEADRequest, @@ -27,7 +24,6 @@ from ..utils import ( unified_timestamp, update_url_query, url_or_none, - urlhandle_detect_ext, ) @@ -97,7 +93,7 @@ class SoundcloudIE(InfoExtractor): 'repost_count': int, } }, - # geo-restricted + # not streamable song { 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'info_dict': { @@ -109,13 +105,18 @@ class SoundcloudIE(InfoExtractor): 'uploader_id': '9615865', 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 227.155, + 'duration': 30, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, }, + 'params': { + # rtmp + 'skip_download': True, + }, + 'skip': 'Preview', }, # private link { @@ -226,6 +227,7 @@ class SoundcloudIE(InfoExtractor): 'skip_download': True, }, }, + # not available via api.soundcloud.com/i1/tracks/id/streams { 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', @@ -234,7 +236,7 @@ class SoundcloudIE(InfoExtractor): 'ext': 'mp3', 'title': 'Mezzo Valzer', 'description': 'md5:4138d582f81866a530317bae316e8b61', - 'uploader': 'Micronie', + 'uploader': 'Giovanni Sarani', 'uploader_id': '3352531', 'timestamp': 1551394171, 'upload_date': '20190228', @@ -246,16 +248,14 @@ class SoundcloudIE(InfoExtractor): 'comment_count': int, 'repost_count': int, }, - }, - { - # with AAC HQ format available via OAuth token - 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', - 'only_matching': True, - }, + 'expected_warnings': ['Unable to download JSON metadata'], + } ] + _API_BASE = 'https://api.soundcloud.com/' _API_V2_BASE = 'https://api-v2.soundcloud.com/' _BASE_URL = 'https://soundcloud.com/' + _CLIENT_ID = 'UW9ajvMgVdMMW3cdeBi8lPfN6dvOVGji' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' _ARTWORK_MAP = { @@ -271,53 +271,14 @@ class SoundcloudIE(InfoExtractor): 'original': 0, } - def _store_client_id(self, client_id): - self._downloader.cache.store('soundcloud', 'client_id', client_id) - - def _update_client_id(self): - webpage = self._download_webpage('https://soundcloud.com/', None) - for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)): - script = self._download_webpage(src, None, fatal=False) - if script: - client_id = self._search_regex( - r'client_id\s*:\s*"([0-9a-zA-Z]{32})"', - script, 'client id', default=None) - if client_id: - self._CLIENT_ID = client_id - self._store_client_id(client_id) - return - raise ExtractorError('Unable to extract client id') - - def _download_json(self, *args, **kwargs): - non_fatal = kwargs.get('fatal') is False - if non_fatal: - del kwargs['fatal'] - query = kwargs.get('query', {}).copy() - for _ in range(2): - query['client_id'] = self._CLIENT_ID - kwargs['query'] = query - try: - return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs)) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - self._store_client_id(None) - self._update_client_id() - continue - elif non_fatal: - self._downloader.report_warning(error_to_compat_str(e)) - return False - raise - - def _real_initialize(self): - self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk' - @classmethod def _resolv_url(cls, url): - return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + '&client_id=' + cls._CLIENT_ID - def _extract_info_dict(self, info, full_title=None, secret_token=None): + def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2): track_id = compat_str(info['id']) title = info['title'] + track_base_url = self._API_BASE + 'tracks/%s' % track_id format_urls = set() formats = [] @@ -326,27 +287,26 @@ class SoundcloudIE(InfoExtractor): query['secret_token'] = secret_token if info.get('downloadable') and info.get('has_downloads_left'): - download_url = update_url_query( - self._API_V2_BASE + 'tracks/' + track_id + '/download', query) - redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') - if redirect_url: - urlh = self._request_webpage( - HEADRequest(redirect_url), track_id, fatal=False) - if urlh: - format_url = urlh.geturl() - format_urls.add(format_url) - formats.append({ - 'format_id': 'download', - 'ext': urlhandle_detect_ext(urlh) or 'mp3', - 'filesize': int_or_none(urlh.headers.get('Content-Length')), - 'url': format_url, - 'preference': 10, - }) + format_url = update_url_query( + info.get('download_url') or track_base_url + '/download', query) + format_urls.add(format_url) + if version == 2: + v1_info = self._download_json( + track_base_url, track_id, query=query, fatal=False) or {} + else: + v1_info = info + formats.append({ + 'format_id': 'download', + 'ext': v1_info.get('original_format') or 'mp3', + 'filesize': int_or_none(v1_info.get('original_content_size')), + 'url': format_url, + 'preference': 10, + }) def invalid_url(url): - return not url or url in format_urls + return not url or url in format_urls or re.search(r'/(?:preview|playlist)/0/30/', url) - def add_format(f, protocol, is_preview=False): + def add_format(f, protocol): mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) if mobj: for k, v in mobj.groupdict().items(): @@ -355,27 +315,16 @@ class SoundcloudIE(InfoExtractor): format_id_list = [] if protocol: format_id_list.append(protocol) - ext = f.get('ext') - if ext == 'aac': - f['abr'] = '256' for k in ('ext', 'abr'): v = f.get(k) if v: format_id_list.append(v) - preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) - if preview: - format_id_list.append('preview') abr = f.get('abr') if abr: f['abr'] = int(abr) - if protocol == 'hls': - protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' - else: - protocol = 'http' f.update({ 'format_id': '_'.join(format_id_list), - 'protocol': protocol, - 'preference': -10 if preview else None, + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', }) formats.append(f) @@ -386,7 +335,7 @@ class SoundcloudIE(InfoExtractor): if not isinstance(t, dict): continue format_url = url_or_none(t.get('url')) - if not format_url: + if not format_url or t.get('snipped') or '/preview/' in format_url: continue stream = self._download_json( format_url, track_id, query=query, fatal=False) @@ -409,14 +358,44 @@ class SoundcloudIE(InfoExtractor): add_format({ 'url': stream_url, 'ext': ext, - }, 'http' if protocol == 'progressive' else protocol, - t.get('snipped') or '/preview/' in format_url) + }, 'http' if protocol == 'progressive' else protocol) + + if not formats: + # Old API, does not work for some tracks (e.g. + # https://soundcloud.com/giovannisarani/mezzo-valzer) + # and might serve preview URLs (e.g. + # http://www.soundcloud.com/snbrn/ele) + format_dict = self._download_json( + track_base_url + '/streams', track_id, + 'Downloading track url', query=query, fatal=False) or {} + + for key, stream_url in format_dict.items(): + if invalid_url(stream_url): + continue + format_urls.add(stream_url) + mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key) + if mobj: + protocol, ext, abr = mobj.groups() + add_format({ + 'abr': abr, + 'ext': ext, + 'url': stream_url, + }, protocol) + + if not formats: + # We fallback to the stream_url in the original info, this + # cannot be always used, sometimes it can give an HTTP 404 error + urlh = self._request_webpage( + HEADRequest(info.get('stream_url') or track_base_url + '/stream'), + track_id, query=query, fatal=False) + if urlh: + stream_url = urlh.geturl() + if not invalid_url(stream_url): + add_format({'url': stream_url}, 'http') for f in formats: f['vcodec'] = 'none' - if not formats and info.get('policy') == 'BLOCK': - self.raise_geo_restricted() self._sort_formats(formats) user = info.get('user') or {} @@ -472,7 +451,9 @@ class SoundcloudIE(InfoExtractor): track_id = mobj.group('track_id') - query = {} + query = { + 'client_id': self._CLIENT_ID, + } if track_id: info_json_url = self._API_V2_BASE + 'tracks/' + track_id full_title = track_id @@ -486,24 +467,20 @@ class SoundcloudIE(InfoExtractor): resolve_title += '/%s' % token info_json_url = self._resolv_url(self._BASE_URL + resolve_title) + version = 2 info = self._download_json( - info_json_url, full_title, 'Downloading info JSON', query=query) + info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False) + if not info: + info = self._download_json( + info_json_url.replace(self._API_V2_BASE, self._API_BASE), + full_title, 'Downloading info JSON', query=query) + version = 1 - return self._extract_info_dict(info, full_title, token) + return self._extract_info_dict(info, full_title, token, version) class SoundcloudPlaylistBaseIE(SoundcloudIE): - def _extract_set(self, playlist, token=None): - playlist_id = compat_str(playlist['id']) - tracks = playlist.get('tracks') or [] - if not all([t.get('permalink_url') for t in tracks]) and token: - tracks = self._download_json( - self._API_V2_BASE + 'tracks', playlist_id, - 'Downloading tracks', query={ - 'ids': ','.join([compat_str(t['id']) for t in tracks]), - 'playlistId': playlist_id, - 'playlistSecretToken': token, - }) + def _extract_track_entries(self, tracks, token=None): entries = [] for track in tracks: track_id = str_or_none(track.get('id')) @@ -516,10 +493,7 @@ class SoundcloudPlaylistBaseIE(SoundcloudIE): url += '?secret_token=' + token entries.append(self.url_result( url, SoundcloudIE.ie_key(), track_id)) - return self.playlist_result( - entries, playlist_id, - playlist.get('title'), - playlist.get('description')) + return entries class SoundcloudSetIE(SoundcloudPlaylistBaseIE): @@ -530,7 +504,6 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): 'info_dict': { 'id': '2284613', 'title': 'The Royal Concept EP', - 'description': 'md5:71d07087c7a449e8941a70a29e34671e', }, 'playlist_mincount': 5, }, { @@ -553,13 +526,17 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) - return self._extract_set(info, token) + entries = self._extract_track_entries(info['tracks'], token) + + return self.playlist_result( + entries, str_or_none(info.get('id')), info.get('title')) -class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): +class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): COMMON_QUERY = { - 'limit': 80000, + 'limit': 2000000000, + 'client_id': self._CLIENT_ID, 'linked_partitioning': '1', } @@ -745,7 +722,9 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') - query = {} + query = { + 'client_id': self._CLIENT_ID, + } token = mobj.group('token') if token: query['secret_token'] = token @@ -754,7 +733,10 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): self._API_V2_BASE + 'playlists/' + playlist_id, playlist_id, 'Downloading playlist', query=query) - return self._extract_set(data, token) + entries = self._extract_track_entries(data['tracks'], token) + + return self.playlist_result( + entries, playlist_id, data.get('title'), data.get('description')) class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): @@ -779,6 +761,7 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): self._MAX_RESULTS_PER_PAGE) query.update({ 'limit': limit, + 'client_id': self._CLIENT_ID, 'linked_partitioning': 1, 'offset': 0, }) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 61ca902ce2..e040ada29b 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, merge_dicts, orderedSet, @@ -65,7 +64,7 @@ class SpankBangIE(InfoExtractor): url.replace('/%s/embed' % video_id, '/%s/video' % video_id), video_id, headers={'Cookie': 'country=US'}) - if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage): + if re.search(r'<[^>]+\bid=["\']video_removed', webpage): raise ExtractorError( 'Video %s is not available' % video_id, expected=True) @@ -76,20 +75,11 @@ class SpankBangIE(InfoExtractor): if not f_url: return f = parse_resolution(format_id) - ext = determine_ext(f_url) - if format_id.startswith('m3u8') or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - f_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif format_id.startswith('mpd') or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - f_url, video_id, mpd_id='dash', fatal=False)) - elif ext == 'mp4' or f.get('width') or f.get('height'): - f.update({ - 'url': f_url, - 'format_id': format_id, - }) - formats.append(f) + f.update({ + 'url': f_url, + 'format_id': format_id, + }) + formats.append(f) STREAM_URL_PREFIX = 'stream_url_' @@ -103,22 +93,28 @@ class SpankBangIE(InfoExtractor): r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'stream key', group='value') + sb_csrf_session = self._get_cookies( + 'https://spankbang.com')['sb_csrf_session'].value + stream = self._download_json( 'https://spankbang.com/api/videos/stream', video_id, 'Downloading stream JSON', data=urlencode_postdata({ 'id': stream_key, 'data': 0, + 'sb_csrf_session': sb_csrf_session, }), headers={ 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': sb_csrf_session, }) for format_id, format_url in stream.items(): - if format_url and isinstance(format_url, list): - format_url = format_url[0] - extract_format(format_id, format_url) + if format_id.startswith(STREAM_URL_PREFIX): + if format_url and isinstance(format_url, list): + format_url = format_url[0] + extract_format( + format_id[len(STREAM_URL_PREFIX):], format_url) - self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) + self._sort_formats(formats) info = self._search_json_ld(webpage, video_id, default={}) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 35ab9ec375..44d8fa52f3 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -3,47 +3,34 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - merge_dicts, - str_or_none, - str_to_int, - url_or_none, +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, ) +from ..utils import ( + sanitized_Request, + str_to_int, + unified_strdate, +) +from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?spankwire\.com/ - (?: - [^/]+/video| - EmbedPlayer\.aspx/?\?.*?\bArticleId= - ) - (?P<id>\d+) - ''' + _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<id>[0-9]+)/?)' _TESTS = [{ # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', + 'md5': '8bbfde12b101204b39e4b9fe7eb67095', 'info_dict': { 'id': '103545', 'ext': 'mp4', 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', 'description': 'Crazy Bitch X rated music video.', - 'duration': 222, 'uploader': 'oreusz', 'uploader_id': '124697', - 'timestamp': 1178587885, - 'upload_date': '20070508', - 'average_rating': float, - 'view_count': int, - 'comment_count': int, + 'upload_date': '20070507', 'age_limit': 18, - 'categories': list, - 'tags': list, - }, + } }, { # download URL pattern: */mp4_<format_id>_<video_id>.mp4 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', @@ -58,125 +45,83 @@ class SpankwireIE(InfoExtractor): 'upload_date': '20150822', 'age_limit': 18, }, - 'params': { - 'proxy': '127.0.0.1:8118' - }, - 'skip': 'removed', - }, { - 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', - 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', - webpage) - def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') - video = self._download_json( - 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) + req = sanitized_Request('http://www.' + mobj.group('url')) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) - title = video['title'] + title = self._html_search_regex( + r'<h1>([^<]+)', webpage, 'title') + description = self._html_search_regex( + r'(?s)<div\s+id="descriptionContent">(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._html_search_regex( + r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', + webpage, 'thumbnail', fatal=False) + + uploader = self._html_search_regex( + r'by:\s*<a [^>]*>(.+?)</a>', + webpage, 'uploader', fatal=False) + uploader_id = self._html_search_regex( + r'by:\s*<a href="/(?:user/viewProfile|Profile\.aspx)\?.*?UserId=(\d+).*?"', + webpage, 'uploader id', fatal=False) + upload_date = unified_strdate(self._html_search_regex( + r'</a> on (.+?) at \d+:\d+', + webpage, 'upload date', fatal=False)) + + view_count = str_to_int(self._html_search_regex( + r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>', + webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._html_search_regex( + r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>', + webpage, 'comment count', fatal=False)) + + videos = re.findall( + r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) + heights = [int(video[0]) for video in videos] + video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) + if webpage.find(r'flashvars\.encrypted = "true"') != -1: + password = self._search_regex( + r'flashvars\.video_title = "([^"]+)', + webpage, 'password').replace('+', ' ') + video_urls = list(map( + lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), + video_urls)) formats = [] - videos = video.get('videos') - if isinstance(videos, dict): - for format_id, format_url in videos.items(): - video_url = url_or_none(format_url) - if not format_url: - continue - height = int_or_none(self._search_regex( - r'(\d+)[pP]', format_id, 'height', default=None)) - m = re.search( - r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url) - if m: - tbr = int(m.group('tbr')) - height = height or int(m.group('height')) - else: - tbr = None - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height if height else format_id, - 'height': height, - 'tbr': tbr, - }) - m3u8_url = url_or_none(video.get('HLS')) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id')) - - view_count = str_to_int(video.get('viewed')) - - thumbnails = [] - for preference, t in enumerate(('', '2x'), start=0): - thumbnail_url = url_or_none(video.get('poster%s' % t)) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'preference': preference, + for height, video_url in zip(heights, video_urls): + path = compat_urllib_parse_urlparse(video_url).path + m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path) + if m: + tbr = int(m.group('tbr')) + height = int(m.group('height')) + else: + tbr = None + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height, + 'height': height, + 'tbr': tbr, }) + self._sort_formats(formats) - def extract_names(key): - entries_list = video.get(key) - if not isinstance(entries_list, list): - return - entries = [] - for entry in entries_list: - name = str_or_none(entry.get('name')) - if name: - entries.append(name) - return entries + age_limit = self._rta_search(webpage) - categories = extract_names('categories') - tags = extract_names('tags') - - uploader = None - info = {} - - webpage = self._download_webpage( - 'https://www.spankwire.com/_/video%s/' % video_id, video_id, - fatal=False) - if webpage: - info = self._search_json_ld(webpage, video_id, default={}) - thumbnail_url = None - if 'thumbnail' in info: - thumbnail_url = url_or_none(info['thumbnail']) - del info['thumbnail'] - if not thumbnail_url: - thumbnail_url = self._og_search_thumbnail(webpage) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'preference': 10, - }) - uploader = self._html_search_regex( - r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>', - webpage, 'uploader', fatal=False) - if not view_count: - view_count = str_to_int(self._search_regex( - r'data-views=["\']([\d,.]+)', webpage, 'view count', - fatal=False)) - - return merge_dicts({ + return { 'id': video_id, 'title': title, - 'description': video.get('description'), - 'duration': int_or_none(video.get('duration')), - 'thumbnails': thumbnails, + 'description': description, + 'thumbnail': thumbnail, 'uploader': uploader, - 'uploader_id': str_or_none(video.get('userId')), - 'timestamp': int_or_none(video.get('time_approved_on')), - 'average_rating': float_or_none(video.get('rating')), + 'uploader_id': uploader_id, + 'upload_date': upload_date, 'view_count': view_count, - 'comment_count': int_or_none(video.get('comments')), - 'age_limit': 18, - 'categories': categories, - 'tags': tags, + 'comment_count': comment_count, 'formats': formats, - }, info) + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index aabff7a3ce..7c11ea7aaf 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -8,10 +8,15 @@ class BellatorIE(MTVServicesInfoExtractor): _TESTS = [{ 'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', 'info_dict': { - 'title': 'Michael Page vs. Evangelista Cyborg', - 'description': 'md5:0d917fc00ffd72dd92814963fc6cbb05', + 'id': 'b55e434e-fde1-4a98-b7cc-92003a034de4', + 'ext': 'mp4', + 'title': 'Douglas Lima vs. Paul Daley - Round 1', + 'description': 'md5:805a8dd29310fd611d32baba2f767885', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, - 'playlist_count': 3, }, { 'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', 'only_matching': True, @@ -20,9 +25,6 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - return self._extract_triforce_mgid(webpage) - class ParamountNetworkIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 378fc75686..a3c35a899a 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -13,18 +13,36 @@ from ..utils import ( class SportDeutschlandIE(InfoExtractor): _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])' _TESTS = [{ - 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'url': 'http://sportdeutschland.tv/badminton/live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', 'info_dict': { - 'id': 're-live-deutsche-meisterschaften-2020-halbfinals', + 'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', 'ext': 'mp4', - 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', - 'categories': ['Badminton-Deutschland'], + 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', + 'categories': ['Badminton'], 'view_count': int, - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': r're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', 'timestamp': int, - 'upload_date': '20200201', - 'description': 're:.*', # meaningless description for THIS video + 'upload_date': 're:^201408[23][0-9]$', }, + 'params': { + 'skip_download': 'Live stream', + }, + }, { + 'url': 'http://sportdeutschland.tv/li-ning-badminton-wm-2014/lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs', + 'info_dict': { + 'id': 'lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs', + 'ext': 'mp4', + 'upload_date': '20140825', + 'description': 'md5:60a20536b57cee7d9a4ec005e8687504', + 'timestamp': 1408976060, + 'duration': 2732, + 'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee', + 'thumbnail': r're:^https?://.*\.jpg$', + 'view_count': int, + 'categories': ['Li-Ning Badminton WM 2014'], + + } }] def _real_extract(self, url): @@ -32,7 +50,7 @@ class SportDeutschlandIE(InfoExtractor): video_id = mobj.group('id') sport_id = mobj.group('sport') - api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( + api_url = 'http://proxy.vidibusdynamic.net/sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( sport_id, video_id) req = sanitized_Request(api_url, headers={ 'Accept': 'application/vnd.vidibus.v2.html+json', diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 359dadaa3c..28baf901c9 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -1,14 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .ard import ARDMediathekBaseIE +from .ard import ARDMediathekIE from ..utils import ( ExtractorError, get_element_by_attribute, ) -class SRMediathekIE(ARDMediathekBaseIE): +class SRMediathekIE(ARDMediathekIE): IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py index 4dbead2ba4..ae2ac1b42f 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/youtube_dl/extractor/stretchinternet.py @@ -5,28 +5,44 @@ from ..utils import int_or_none class StretchInternetIE(InfoExtractor): - _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/(?:portal|full)\.htm\?.*?\beventId=(?P<id>\d+)' + _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/portal\.htm\?.*?\beventId=(?P<id>\d+)' _TEST = { - 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=573272&streamType=video', + 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=313900&streamType=video', 'info_dict': { - 'id': '573272', + 'id': '313900', 'ext': 'mp4', - 'title': 'University of Mary Wrestling vs. Upper Iowa', - 'timestamp': 1575668361, - 'upload_date': '20191206', + 'title': 'Augustana (S.D.) Baseball vs University of Mary', + 'description': 'md5:7578478614aae3bdd4a90f578f787438', + 'timestamp': 1490468400, + 'upload_date': '20170325', } } def _real_extract(self, url): video_id = self._match_id(url) + stream = self._download_json( + 'https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' + % video_id, video_id) + + video_url = 'https://%s' % stream['source'] + event = self._download_json( - 'https://api.stretchinternet.com/trinity/event/tcg/' + video_id, - video_id)[0] + 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', + video_id, query={ + 'clientID': 99997, + 'eventID': video_id, + 'token': 'asdf', + })['event'] + + title = event.get('title') or event['mobileTitle'] + description = event.get('customText') + timestamp = int_or_none(event.get('longtime')) return { 'id': video_id, - 'title': event['title'], - 'timestamp': int_or_none(event.get('dateCreated'), 1000), - 'url': 'https://' + event['media'][0]['url'], + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'url': video_url, } diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index e12389cad8..0901c3163e 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -4,14 +4,19 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, dict_get, int_or_none, - str_or_none, + orderedSet, strip_or_none, try_get, + urljoin, + compat_str, ) @@ -232,23 +237,23 @@ class SVTPlayIE(SVTPlayBaseIE): class SVTSeriesIE(SVTPlayBaseIE): - _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)(?:.+?\btab=(?P<season_slug>[^&#]+))?' + _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://www.svtplay.se/rederiet', 'info_dict': { - 'id': '14445680', + 'id': 'rederiet', 'title': 'Rederiet', - 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', }, 'playlist_mincount': 318, }, { - 'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680', + 'url': 'https://www.svtplay.se/rederiet?tab=sasong2', 'info_dict': { - 'id': 'season-2-14445680', + 'id': 'rederiet-sasong2', 'title': 'Rederiet - Säsong 2', - 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', + 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', }, - 'playlist_mincount': 12, + 'playlist_count': 12, }] @classmethod @@ -256,87 +261,83 @@ class SVTSeriesIE(SVTPlayBaseIE): return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) def _real_extract(self, url): - series_slug, season_id = re.match(self._VALID_URL, url).groups() + series_id = self._match_id(url) - series = self._download_json( - 'https://api.svt.se/contento/graphql', series_slug, - 'Downloading series page', query={ - 'query': '''{ - listablesBySlug(slugs: ["%s"]) { - associatedContent(include: [productionPeriod, season]) { - items { - item { - ... on Episode { - videoSvtId - } - } - } - id - name - } - id - longDescription - name - shortDescription - } -}''' % series_slug, - })['data']['listablesBySlug'][0] + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + season_slug = qs.get('tab', [None])[0] + + if season_slug: + series_id += '-%s' % season_slug + + webpage = self._download_webpage( + url, series_id, 'Downloading series page') + + root = self._parse_json( + self._search_regex( + self._SVTPLAY_RE, webpage, 'content', group='json'), + series_id) season_name = None entries = [] - for season in series['associatedContent']: + for season in root['relatedVideoContent']['relatedVideosAccordion']: if not isinstance(season, dict): continue - if season_id: - if season.get('id') != season_id: + if season_slug: + if season.get('slug') != season_slug: continue season_name = season.get('name') - items = season.get('items') - if not isinstance(items, list): + videos = season.get('videos') + if not isinstance(videos, list): continue - for item in items: - video = item.get('item') or {} - content_id = video.get('videoSvtId') - if not content_id or not isinstance(content_id, compat_str): + for video in videos: + content_url = video.get('contentUrl') + if not content_url or not isinstance(content_url, compat_str): continue - entries.append(self.url_result( - 'svt:' + content_id, SVTPlayIE.ie_key(), content_id)) + entries.append( + self.url_result( + urljoin(url, content_url), + ie=SVTPlayIE.ie_key(), + video_title=video.get('title') + )) - title = series.get('name') - season_name = season_name or season_id + metadata = root.get('metaData') + if not isinstance(metadata, dict): + metadata = {} + + title = metadata.get('title') + season_name = season_name or season_slug if title and season_name: title = '%s - %s' % (title, season_name) - elif season_id: - title = season_id + elif season_slug: + title = season_slug return self.playlist_result( - entries, season_id or series.get('id'), title, - dict_get(series, ('longDescription', 'shortDescription'))) + entries, series_id, title, metadata.get('description')) class SVTPageIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?svt\.se/(?P<path>(?:[^/]+/)*(?P<id>[^/?&#]+))' + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa', + 'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', 'info_dict': { - 'id': '25298267', - 'title': 'Bakom masken – Lehners kamp mot mental ohälsa', + 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', }, - 'playlist_count': 4, + 'playlist_count': 7, }, { - 'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien', + 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', 'info_dict': { - 'id': '24243746', - 'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien', + 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', }, - 'playlist_count': 2, + 'playlist_count': 1, }, { # only programTitle 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', 'info_dict': { - 'id': '8439V2K', + 'id': '2900353', 'ext': 'mp4', 'title': 'Stjärnorna skojar till det - under SVT-intervjun', 'duration': 27, @@ -355,26 +356,16 @@ class SVTPageIE(InfoExtractor): return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): - path, display_id = re.match(self._VALID_URL, url).groups() + playlist_id = self._match_id(url) - article = self._download_json( - 'https://api.svt.se/nss-api/page/' + path, display_id, - query={'q': 'articles'})['articles']['content'][0] + webpage = self._download_webpage(url, playlist_id) - entries = [] + entries = [ + self.url_result( + 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-video-id=["\'](\d+)', webpage))] - def _process_content(content): - if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'): - video_id = compat_str(content['image']['svtId']) - entries.append(self.url_result( - 'svt:' + video_id, SVTPlayIE.ie_key(), video_id)) + title = strip_or_none(self._og_search_title(webpage, default=None)) - for media in article.get('media', []): - _process_content(media) - - for obj in article.get('structuredBody', []): - _process_content(obj.get('content') or {}) - - return self.playlist_result( - entries, str_or_none(article.get('id')), - strip_or_none(article.get('title'))) + return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index a75369dbe8..7d2e34b3bc 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -4,12 +4,11 @@ import re from .common import InfoExtractor from .wistia import WistiaIE +from ..compat import compat_str from ..utils import ( clean_html, ExtractorError, - int_or_none, get_element_by_class, - strip_or_none, urlencode_postdata, urljoin, ) @@ -21,8 +20,8 @@ class TeachableBaseIE(InfoExtractor): _SITES = { # Only notable ones here - 'v1.upskillcourses.com': 'upskill', - 'gns3.teachable.com': 'gns3', + 'upskillcourses.com': 'upskill', + 'academy.gns3.com': 'gns3', 'academyhacker.com': 'academyhacker', 'stackskills.com': 'stackskills', 'market.saleshacker.com': 'saleshacker', @@ -59,7 +58,7 @@ class TeachableBaseIE(InfoExtractor): self._logged_in = True return - login_url = urlh.geturl() + login_url = compat_str(urlh.geturl()) login_form = self._hidden_inputs(login_page) @@ -111,29 +110,27 @@ class TeachableIE(TeachableBaseIE): ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364', + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', 'info_dict': { - 'id': 'untlgzk1v7', - 'ext': 'bin', - 'title': 'Overview', - 'description': 'md5:071463ff08b86c208811130ea1c2464c', - 'duration': 736.4, - 'timestamp': 1542315762, - 'upload_date': '20181115', - 'chapter': 'Welcome', - 'chapter_number': 1, + 'id': 'uzw6zw58or', + 'ext': 'mp4', + 'title': 'Welcome to the Course!', + 'description': 'md5:65edb0affa582974de4625b9cdea1107', + 'duration': 138.763, + 'timestamp': 1479846621, + 'upload_date': '20161122', }, 'params': { 'skip_download': True, }, }, { - 'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100', + 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', 'only_matching': True, }, { - 'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939', + 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', 'only_matching': True, }, { - 'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', 'only_matching': True, }] @@ -163,51 +160,22 @@ class TeachableIE(TeachableBaseIE): webpage = self._download_webpage(url, video_id) - wistia_urls = WistiaIE._extract_urls(webpage) - if not wistia_urls: + wistia_url = WistiaIE._extract_url(webpage) + if not wistia_url: if any(re.search(p, webpage) for p in ( r'class=["\']lecture-contents-locked', r'>\s*Lecture contents locked', - r'id=["\']lecture-locked', - # https://academy.tailoredtutors.co.uk/courses/108779/lectures/1955313 - r'class=["\'](?:inner-)?lesson-locked', - r'>LESSON LOCKED<')): + r'id=["\']lecture-locked')): self.raise_login_required('Lecture contents locked') - raise ExtractorError('Unable to find video URL') title = self._og_search_title(webpage, default=None) - chapter = None - chapter_number = None - section_item = self._search_regex( - r'(?s)(?P<li><li[^>]+\bdata-lecture-id=["\']%s[^>]+>.+?</li>)' % video_id, - webpage, 'section item', default=None, group='li') - if section_item: - chapter_number = int_or_none(self._search_regex( - r'data-ss-position=["\'](\d+)', section_item, 'section id', - default=None)) - if chapter_number is not None: - sections = [] - for s in re.findall( - r'(?s)<div[^>]+\bclass=["\']section-title[^>]+>(.+?)</div>', webpage): - section = strip_or_none(clean_html(s)) - if not section: - sections = [] - break - sections.append(section) - if chapter_number <= len(sections): - chapter = sections[chapter_number - 1] - - entries = [{ + return { '_type': 'url_transparent', 'url': wistia_url, 'ie_key': WistiaIE.ie_key(), 'title': title, - 'chapter': chapter, - 'chapter_number': chapter_number, - } for wistia_url in wistia_urls] - - return self.playlist_result(entries, video_id, title) + } class TeachableCourseIE(TeachableBaseIE): @@ -219,20 +187,20 @@ class TeachableCourseIE(TeachableBaseIE): /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+) ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/', + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', 'info_dict': { 'id': 'essential-web-developer-course', 'title': 'The Essential Web Developer Course (Free)', }, 'playlist_count': 192, }, { - 'url': 'http://v1.upskillcourses.com/courses/119763/', + 'url': 'http://upskillcourses.com/courses/119763/', 'only_matching': True, }, { - 'url': 'http://v1.upskillcourses.com/courses/enrolled/119763', + 'url': 'http://upskillcourses.com/courses/enrolled/119763', 'only_matching': True, }, { - 'url': 'https://gns3.teachable.com/courses/enrolled/423415', + 'url': 'https://academy.gns3.com/courses/enrolled/423415', 'only_matching': True, }, { 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 3e1a7a9e60..33a72083bf 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,21 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .jwplatform import JWPlatformIE from .nexx import NexxIE from ..compat import compat_urlparse -from ..utils import ( - NO_DEFAULT, - smuggle_url, -) class Tele5IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _GEO_COUNTRIES = ['DE'] _TESTS = [{ 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416', 'info_dict': { @@ -28,21 +20,6 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, - }, { - # jwplatform, nexx unavailable - 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', - 'info_dict': { - 'id': 'WJuiOlUp', - 'ext': 'mp4', - 'upload_date': '20200603', - 'timestamp': 1591214400, - 'title': 'Ghoul - Das Geheimnis des Friedhofmonsters', - 'description': 'md5:42002af1d887ff3d5b2b3ca1f8137d97', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [JWPlatformIE.ie_key()], }, { 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', 'only_matching': True, @@ -67,42 +44,14 @@ class Tele5IE(InfoExtractor): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - NEXX_ID_RE = r'\d{6,}' - JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' - - def nexx_result(nexx_id): - return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, - ie=NexxIE.ie_key(), video_id=nexx_id) - - nexx_id = jwplatform_id = None - - if video_id: - if re.match(NEXX_ID_RE, video_id): - return nexx_result(video_id) - elif re.match(JWPLATFORM_ID_RE, video_id): - jwplatform_id = video_id - - if not nexx_id: + if not video_id: display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - - def extract_id(pattern, name, default=NO_DEFAULT): - return self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, - r'\s+id\s*=\s*["\']player_(%s)' % pattern, - r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, - default=default) - - nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) - if nexx_id: - return nexx_result(nexx_id) - - if not jwplatform_id: - jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') + video_id = self._html_search_regex( + (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)', + r'\s+id\s*=\s*["\']player_(\d{6,})', + r'\bdata-id\s*=\s*["\'](\d{6,})'), webpage, 'video id') return self.url_result( - smuggle_url( - 'jwplatform:%s' % jwplatform_id, - {'geo_countries': self._GEO_COUNTRIES}), - ie=JWPlatformIE.ie_key(), video_id=jwplatform_id) + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, + ie=NexxIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 9ba3da341d..d37e1b0557 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -11,7 +11,6 @@ from ..utils import ( determine_ext, int_or_none, str_or_none, - try_get, urljoin, ) @@ -25,7 +24,7 @@ class TelecincoIE(InfoExtractor): 'info_dict': { 'id': '1876350223', 'title': 'Bacalao con kokotxas al pil-pil', - 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', + 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', }, 'playlist': [{ 'md5': 'adb28c37238b675dad0f042292f209a7', @@ -56,26 +55,6 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477', 'duration': 50, }, - }, { - # video in opening's content - 'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html', - 'info_dict': { - 'id': '2907195140', - 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', - 'description': 'md5:73f340a7320143d37ab895375b2bf13a', - }, - 'playlist': [{ - 'md5': 'adb28c37238b675dad0f042292f209a7', - 'info_dict': { - 'id': 'TpI2EttSDAReWpJ1o0NVh2', - 'ext': 'mp4', - 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', - 'duration': 1015, - }, - }], - 'params': { - 'skip_download': True, - }, }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', 'only_matching': True, @@ -156,28 +135,17 @@ class TelecincoIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) article = self._parse_json(self._search_regex( - r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})', + r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})', webpage, 'article'), display_id)['article'] title = article.get('title') - description = clean_html(article.get('leadParagraph')) or '' + description = clean_html(article.get('leadParagraph')) if article.get('editorialType') != 'VID': entries = [] - body = [article.get('opening')] - body.extend(try_get(article, lambda x: x['body'], list) or []) - for p in body: - if not isinstance(p, dict): - continue + for p in article.get('body', []): content = p.get('content') - if not content: + if p.get('type') != 'video' or not content: continue - type_ = p.get('type') - if type_ == 'paragraph': - content_str = str_or_none(content) - if content_str: - description += content_str - continue - if type_ == 'video' and isinstance(content, dict): - entries.append(self._parse_content(content, url)) + entries.append(self._parse_content(content, url)) return self.playlist_result( entries, str_or_none(article.get('id')), title, description) content = article['opening']['content'] diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index c82c94b3a0..ae9f667874 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -38,6 +38,8 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', + 'upload_date': '20180222', + 'timestamp': 1519326631, }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py index af325fea8f..dff44a4e2f 100644 --- a/youtube_dl/extractor/tenplay.py +++ b/youtube_dl/extractor/tenplay.py @@ -10,8 +10,8 @@ from ..utils import ( class TenPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})' - _TESTS = [{ + _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/[^/]+/episodes/[^/]+/[^/]+/(?P<id>tpv\d{6}[a-z]{5})' + _TEST = { 'url': 'https://10play.com.au/masterchef/episodes/season-1/masterchef-s1-ep-1/tpv190718kwzga', 'info_dict': { 'id': '6060533435001', @@ -27,10 +27,7 @@ class TenPlayIE(InfoExtractor): 'format': 'bestvideo', 'skip_download': True, } - }, { - 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', - 'only_matching': True, - }] + } BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dl/extractor/tfo.py b/youtube_dl/extractor/tfo.py index 0631cb7aba..0e2370cd82 100644 --- a/youtube_dl/extractor/tfo.py +++ b/youtube_dl/extractor/tfo.py @@ -17,12 +17,14 @@ class TFOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P<id>\d+)' _TEST = { 'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon', - 'md5': 'cafbe4f47a8dae0ca0159937878100d6', + 'md5': '47c987d0515561114cf03d1226a9d4c7', 'info_dict': { - 'id': '7da3d50e495c406b8fc0b997659cc075', + 'id': '100463871', 'ext': 'mp4', 'title': 'Video Game Hackathon', 'description': 'md5:558afeba217c6c8d96c60e5421795c07', + 'upload_date': '20160212', + 'timestamp': 1455310233, } } diff --git a/youtube_dl/extractor/thisoldhouse.py b/youtube_dl/extractor/thisoldhouse.py index a3d9b4017b..6ab147ad72 100644 --- a/youtube_dl/extractor/thisoldhouse.py +++ b/youtube_dl/extractor/thisoldhouse.py @@ -2,46 +2,43 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str +from ..utils import try_get class ThisOldHouseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/]+/)?\d+)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', + 'md5': '568acf9ca25a639f0c4ff905826b662f', 'info_dict': { - 'id': '5dcdddf673c3f956ef5db202', + 'id': '2REGtUDQ', 'ext': 'mp4', 'title': 'How to Build a Storage Bench', 'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.', 'timestamp': 1442548800, 'upload_date': '20150918', - }, - 'params': { - 'skip_download': True, - }, + } }, { 'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins', 'only_matching': True, }, { 'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric', 'only_matching': True, - }, { - 'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench', - 'only_matching': True, - }, { - 'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost', - 'only_matching': True, - }, { - # iframe www.thisoldhouse.com - 'url': 'https://www.thisoldhouse.com/21083431/seaside-transformation-the-westerly-project', - 'only_matching': True, }] - _ZYPE_TMPL = 'https://player.zype.com/embed/%s.html?api_key=hsOk_yMSPYNrT22e9pu8hihLXjaZf0JW5jsOWv4ZqyHJFvkJn6rtToHl09tbbsbe' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex( - r'<iframe[^>]+src=[\'"](?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', - webpage, 'video id') - return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id) + (r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1', + r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1'), + webpage, 'video id', default=None, group='id') + if not video_id: + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + video_id = try_get( + drupal_settings, lambda x: x['jwplatform']['video_id'], + compat_str) or list(drupal_settings['comScore'])[0] + return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id) diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index ca2e36efe4..5e5efda0f0 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -17,9 +17,9 @@ from ..utils import ( class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' + _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', + 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { 'id': '343115', 'ext': 'mp4', @@ -33,7 +33,7 @@ class ToggleIE(InfoExtractor): } }, { 'note': 'DRM-protected video', - 'url': 'http://www.mewatch.sg/en/movies/dug-s-special-mission/341413', + 'url': 'http://video.toggle.sg/en/movies/dug-s-special-mission/341413', 'info_dict': { 'id': '341413', 'ext': 'wvm', @@ -48,7 +48,7 @@ class ToggleIE(InfoExtractor): }, { # this also tests correct video id extraction 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', - 'url': 'http://www.mewatch.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', + 'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', 'info_dict': { 'id': '332861', 'ext': 'mp4', @@ -65,22 +65,19 @@ class ToggleIE(InfoExtractor): 'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', 'only_matching': True, }, { - 'url': 'http://www.mewatch.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', + 'url': 'http://video.toggle.sg/zh/series/zero-calling-s2-hd/ep13/336367', 'only_matching': True, }, { - 'url': 'http://www.mewatch.sg/zh/series/zero-calling-s2-hd/ep13/336367', + 'url': 'http://video.toggle.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', 'only_matching': True, }, { - 'url': 'http://www.mewatch.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', + 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', 'only_matching': True, }, { - 'url': 'http://www.mewatch.sg/en/movies/seven-days/321936', + 'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', 'only_matching': True, }, { - 'url': 'https://www.mewatch.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', - 'only_matching': True, - }, { - 'url': 'http://www.mewatch.sg/en/channels/eleven-plus/401585', + 'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585', 'only_matching': True, }] diff --git a/youtube_dl/extractor/trunews.py b/youtube_dl/extractor/trunews.py index cca5b5cebd..b0c7caabf3 100644 --- a/youtube_dl/extractor/trunews.py +++ b/youtube_dl/extractor/trunews.py @@ -1,12 +1,21 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + dict_get, + float_or_none, + int_or_none, + unified_timestamp, + update_url_query, + url_or_none, +) class TruNewsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?trunews\.com/stream/(?P<id>[^/?#&]+)' _TEST = { 'url': 'https://www.trunews.com/stream/will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', + 'md5': 'a19c024c3906ff954fac9b96ce66bb08', 'info_dict': { 'id': '5c5a21e65d3c196e1c0020cc', 'display_id': 'will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', @@ -19,16 +28,48 @@ class TruNewsIE(InfoExtractor): }, 'add_ie': ['Zype'], } - _ZYPE_TEMPL = 'https://player.zype.com/embed/%s.js?api_key=X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt' def _real_extract(self, url): display_id = self._match_id(url) - zype_id = self._download_json( + video = self._download_json( 'https://api.zype.com/videos', display_id, query={ 'app_key': 'PUVKp9WgGUb3-JUw6EqafLx8tFVP6VKZTWbUOR-HOm__g4fNDt1bCsm_LgYf_k9H', 'per_page': 1, 'active': 'true', 'friendly_title': display_id, - })['response'][0]['_id'] - return self.url_result(self._ZYPE_TEMPL % zype_id, 'Zype', zype_id) + })['response'][0] + + zype_id = video['_id'] + + thumbnails = [] + thumbnails_list = video.get('thumbnails') + if isinstance(thumbnails_list, list): + for thumbnail in thumbnails_list: + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + '_type': 'url_transparent', + 'url': update_url_query( + 'https://player.zype.com/embed/%s.js' % zype_id, + {'api_key': 'X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt'}), + 'ie_key': 'Zype', + 'id': zype_id, + 'display_id': display_id, + 'title': video.get('title'), + 'description': dict_get(video, ('description', 'ott_description', 'short_description')), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('published_at')), + 'average_rating': float_or_none(video.get('rating')), + 'view_count': int_or_none(video.get('request_count')), + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index ae584ad697..edbb0aa694 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -150,7 +151,7 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) - redirect_url = urlh.geturl() + redirect_url = compat_str(urlh.geturl()) if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): raise ExtractorError( 'This Tumblr may contain sensitive media. ' diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 8bda9348d7..611fdc0c6c 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -106,7 +106,7 @@ class TV2DKBornholmPlayIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'https://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id, + 'http://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id, data=json.dumps({ 'playlist_id': video_id, 'serienavn': '', diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index c498b01916..a819d048c6 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -99,7 +99,7 @@ class TV4IE(InfoExtractor): manifest_url.replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) formats.extend(self._extract_ism_formats( - re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url), + re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), video_id, ism_id='mss', fatal=False)) if not formats and info.get('is_geo_restricted'): diff --git a/youtube_dl/extractor/tv5mondeplus.py b/youtube_dl/extractor/tv5mondeplus.py index b7fe082b9c..88b6baa316 100644 --- a/youtube_dl/extractor/tv5mondeplus.py +++ b/youtube_dl/extractor/tv5mondeplus.py @@ -3,51 +3,31 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, determine_ext, extract_attributes, + get_element_by_class, int_or_none, parse_duration, + parse_iso8601, ) class TV5MondePlusIE(InfoExtractor): IE_DESC = 'TV5MONDE+' - _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' - _TESTS = [{ - # movie - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/rendez-vous-a-atlit', - 'md5': '8cbde5ea7b296cf635073e27895e227f', + _VALID_URL = r'https?://(?:www\.)?tv5mondeplus\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.tv5mondeplus.com/toutes-les-videos/documentaire/tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', + 'md5': '12130fc199f020673138a83466542ec6', 'info_dict': { - 'id': '822a4756-0712-7329-1859-a13ac7fd1407', - 'display_id': 'rendez-vous-a-atlit', + 'id': 'tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', 'ext': 'mp4', - 'title': 'Rendez-vous à Atlit', - 'description': 'md5:2893a4c5e1dbac3eedff2d87956e4efb', - 'upload_date': '20200130', - }, - }, { - # series episode - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/c-est-la-vie-ennemie-juree', - 'info_dict': { - 'id': '0df7007c-4900-3936-c601-87a13a93a068', - 'display_id': 'c-est-la-vie-ennemie-juree', - 'ext': 'mp4', - 'title': "C'est la vie - Ennemie jurée", - 'description': 'md5:dfb5c63087b6f35fe0cc0af4fe44287e', - 'upload_date': '20200130', - 'series': "C'est la vie", - 'episode': 'Ennemie jurée', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver', - 'only_matching': True, - }, { - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30', - 'only_matching': True, - }] + 'title': 'Tdah, mon amour - Enfants', + 'description': 'md5:230e3aca23115afcf8006d1bece6df74', + 'upload_date': '20170401', + 'timestamp': 1491022860, + } + } _GEO_BYPASS = False def _real_extract(self, url): @@ -57,7 +37,11 @@ class TV5MondePlusIE(InfoExtractor): if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: self.raise_geo_restricted(countries=['FR']) - title = episode = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') + series = get_element_by_class('video-detail__title', webpage) + title = episode = get_element_by_class( + 'video-detail__subtitle', webpage) or series + if series and series != title: + title = '%s - %s' % (series, title) vpl_data = extract_attributes(self._search_regex( r'(<[^>]+class="video_player_loader"[^>]+>)', webpage, 'video player loader')) @@ -81,37 +65,15 @@ class TV5MondePlusIE(InfoExtractor): }) self._sort_formats(formats) - description = self._html_search_regex( - r'(?s)<div[^>]+class=["\']episode-texte[^>]+>(.+?)</div>', webpage, - 'description', fatal=False) - - series = self._html_search_regex( - r'<p[^>]+class=["\']episode-emission[^>]+>([^<]+)', webpage, - 'series', default=None) - - if series and series != title: - title = '%s - %s' % (series, title) - - upload_date = self._search_regex( - r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})', - webpage, 'upload date', default=None) - if upload_date: - upload_date = upload_date.replace('_', '') - - video_id = self._search_regex( - (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id', - default=display_id) - return { - 'id': video_id, + 'id': display_id, 'display_id': display_id, 'title': title, - 'description': description, + 'description': clean_html(get_element_by_class('video-detail__description', webpage)), 'thumbnail': vpl_data.get('data-image'), 'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)), - 'upload_date': upload_date, + 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)), 'formats': formats, - 'series': series, 'episode': episode, + 'series': series, } diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py index 443f46e8a3..0b863df2ff 100644 --- a/youtube_dl/extractor/tva.py +++ b/youtube_dl/extractor/tva.py @@ -9,8 +9,8 @@ from ..utils import ( class TVAIE(InfoExtractor): - _VALID_URL = r'https?://videos?\.tva\.ca/details/_(?P<id>\d+)' - _TESTS = [{ + _VALID_URL = r'https?://videos\.tva\.ca/details/_(?P<id>\d+)' + _TEST = { 'url': 'https://videos.tva.ca/details/_5596811470001', 'info_dict': { 'id': '5596811470001', @@ -24,10 +24,7 @@ class TVAIE(InfoExtractor): # m3u8 download 'skip_download': True, } - }, { - 'url': 'https://video.tva.ca/details/_5596811470001', - 'only_matching': True, - }] + } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 3c2450dd0c..d82d48f94e 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_str, compat_urlparse, ) from ..utils import ( @@ -14,7 +15,9 @@ from ..utils import ( int_or_none, parse_iso8601, qualities, + smuggle_url, try_get, + unsmuggle_url, update_url_query, url_or_none, ) @@ -232,6 +235,11 @@ class TVPlayIE(InfoExtractor): ] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) + video_id = self._match_id(url) geo_country = self._search_regex( r'https?://[^/]+\.([a-z]{2})', url, @@ -277,6 +285,8 @@ class TVPlayIE(InfoExtractor): 'ext': ext, } if video_url.startswith('rtmp'): + if smuggled_data.get('skip_rtmp'): + continue m = re.search( r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url) if not m: @@ -337,80 +347,115 @@ class ViafreeIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?:www\.)? - viafree\.(?P<country>dk|no|se) - /(?P<id>program(?:mer)?/(?:[^/]+/)+[^/?#&]+) + viafree\. + (?: + (?:dk|no)/programmer| + se/program + ) + /(?:[^/]+/)+(?P<id>[^/?#&]+) ''' _TESTS = [{ - 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', + 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', 'info_dict': { - 'id': '757786', + 'id': '395375', 'ext': 'mp4', - 'title': 'Det beste vorspielet - Sesong 2 - Episode 1', - 'description': 'md5:b632cb848331404ccacd8cd03e83b4c3', - 'series': 'Det beste vorspielet', + 'title': 'Husräddarna S02E02', + 'description': 'md5:4db5c933e37db629b5a2f75dfb34829e', + 'series': 'Husräddarna', + 'season': 'Säsong 2', 'season_number': 2, - 'duration': 1116, - 'timestamp': 1471200600, - 'upload_date': '20160814', + 'duration': 2576, + 'timestamp': 1400596321, + 'upload_date': '20140520', }, 'params': { 'skip_download': True, }, + 'add_ie': [TVPlayIE.ie_key()], }, { # with relatedClips 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1', - 'only_matching': True, + 'info_dict': { + 'id': '758770', + 'ext': 'mp4', + 'title': 'Sommaren med YouTube-stjärnorna S01E01', + 'description': 'md5:2bc69dce2c4bb48391e858539bbb0e3f', + 'series': 'Sommaren med YouTube-stjärnorna', + 'season': 'Säsong 1', + 'season_number': 1, + 'duration': 1326, + 'timestamp': 1470905572, + 'upload_date': '20160811', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], }, { # Different og:image URL schema 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2', 'only_matching': True, }, { - 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', + 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', 'only_matching': True, }, { 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', 'only_matching': True, }] - _GEO_BYPASS = False @classmethod def suitable(cls, url): return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url) def _real_extract(self, url): - country, path = re.match(self._VALID_URL, url).groups() - content = self._download_json( - 'https://viafree-content.mtg-api.com/viafree-content/v1/%s/path/%s' % (country, path), path) - program = content['_embedded']['viafreeBlocks'][0]['_embedded']['program'] - guid = program['guid'] - meta = content['meta'] - title = meta['title'] + video_id = self._match_id(url) - try: - stream_href = self._download_json( - program['_links']['streamLink']['href'], guid, - headers=self.geo_verification_headers())['embedded']['prioritizedStreams'][0]['links']['stream']['href'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_geo_restricted(countries=[country]) - raise + webpage = self._download_webpage(url, video_id) - formats = self._extract_m3u8_formats(stream_href, guid, 'mp4') - self._sort_formats(formats) - episode = program.get('episode') or {} + data = self._parse_json( + self._search_regex( + r'(?s)window\.App\s*=\s*({.+?})\s*;\s*</script', + webpage, 'data', default='{}'), + video_id, transform_source=lambda x: re.sub( + r'(?s)function\s+[a-zA-Z_][\da-zA-Z_]*\s*\([^)]*\)\s*{[^}]*}\s*', + 'null', x), fatal=False) - return { - 'id': guid, - 'title': title, - 'thumbnail': meta.get('image'), - 'description': meta.get('description'), - 'series': episode.get('seriesTitle'), - 'episode_number': int_or_none(episode.get('episodeNumber')), - 'season_number': int_or_none(episode.get('seasonNumber')), - 'duration': int_or_none(try_get(program, lambda x: x['video']['duration']['milliseconds']), 1000), - 'timestamp': parse_iso8601(try_get(program, lambda x: x['availability']['start'])), - 'formats': formats, - } + video_id = None + + if data: + video_id = try_get( + data, lambda x: x['context']['dispatcher']['stores'][ + 'ContentPageProgramStore']['currentVideo']['id'], + compat_str) + + # Fallback #1 (extract from og:image URL schema) + if not video_id: + thumbnail = self._og_search_thumbnail(webpage, default=None) + if thumbnail: + video_id = self._search_regex( + # Patterns seen: + # http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/inbox/765166/a2e95e5f1d735bab9f309fa345cc3f25.jpg + # http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/seasons/15204/758770/4a5ba509ca8bc043e1ebd1a76131cdf2.jpg + r'https?://[^/]+/imagecache/(?:[^/]+/)+(\d{6,})/', + thumbnail, 'video id', default=None) + + # Fallback #2. Extract from raw JSON string. + # May extract wrong video id if relatedClips is present. + if not video_id: + video_id = self._search_regex( + r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})', + webpage, 'video id') + + return self.url_result( + smuggle_url( + 'mtg:%s' % video_id, + { + 'geo_countries': [ + compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]], + # rtmp host mtgfs.fplive.net for viafree is unresolvable + 'skip_rtmp': True, + }), + ie=TVPlayIE.ie_key(), video_id=video_id) class TVPlayHomeIE(InfoExtractor): diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 74d14049b4..1d66eeaff6 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -17,8 +17,8 @@ class TwentyFourVideoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?P<host> - (?:(?:www|porno?)\.)?24video\. - (?:net|me|xxx|sexy?|tube|adult|site|vip) + (?:(?:www|porno)\.)?24video\. + (?:net|me|xxx|sexy?|tube|adult|site) )/ (?: video/(?:(?:view|xml)/)?| @@ -59,12 +59,6 @@ class TwentyFourVideoIE(InfoExtractor): }, { 'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle', 'only_matching': True, - }, { - 'url': 'https://www.24video.vip/video/view/1044982', - 'only_matching': True, - }, { - 'url': 'https://porn.24video.net/video/2640421-vsya-takay', - 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index e211cd4c84..a8c2502af8 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -21,8 +21,6 @@ from ..utils import ( orderedSet, parse_duration, parse_iso8601, - qualities, - str_or_none, try_get, unified_timestamp, update_url_query, @@ -52,14 +50,8 @@ class TwitchBaseIE(InfoExtractor): def _call_api(self, path, item_id, *args, **kwargs): headers = kwargs.get('headers', {}).copy() - headers.update({ - 'Accept': 'application/vnd.twitchtv.v5+json; charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }) - kwargs.update({ - 'headers': headers, - 'expected_status': (400, 410), - }) + headers['Client-ID'] = self._CLIENT_ID + kwargs['headers'] = headers response = self._download_json( '%s/%s' % (self._API_BASE, path), item_id, *args, **compat_kwargs(kwargs)) @@ -194,27 +186,12 @@ class TwitchItemBaseIE(TwitchBaseIE): is_live = False else: is_live = None - _QUALITIES = ('small', 'medium', 'large') - quality_key = qualities(_QUALITIES) - thumbnails = [] - preview = info.get('preview') - if isinstance(preview, dict): - for thumbnail_id, thumbnail_url in preview.items(): - thumbnail_url = url_or_none(thumbnail_url) - if not thumbnail_url: - continue - if thumbnail_id not in _QUALITIES: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'preference': quality_key(thumbnail_id), - }) return { 'id': info['_id'], 'title': info.get('title') or 'Untitled Broadcast', 'description': info.get('description'), 'duration': int_or_none(info.get('length')), - 'thumbnails': thumbnails, + 'thumbnail': info.get('preview'), 'uploader': info.get('channel', {}).get('display_name'), 'uploader_id': info.get('channel', {}).get('name'), 'timestamp': parse_iso8601(info.get('recorded_at')), @@ -595,19 +572,11 @@ class TwitchStreamIE(TwitchBaseIE): else super(TwitchStreamIE, cls).suitable(url)) def _real_extract(self, url): - channel_name = self._match_id(url) - - access_token = self._call_api( - 'api/channels/%s/access_token' % channel_name, channel_name, - 'Downloading access token JSON') - - token = access_token['token'] - channel_id = compat_str(self._parse_json( - token, channel_name)['channel_id']) + channel_id = self._match_id(url) stream = self._call_api( - 'kraken/streams/%s?stream_type=all' % channel_id, - channel_id, 'Downloading stream JSON').get('stream') + 'kraken/streams/%s?stream_type=all' % channel_id, channel_id, + 'Downloading stream JSON').get('stream') if not stream: raise ExtractorError('%s is offline' % channel_id, expected=True) @@ -616,9 +585,11 @@ class TwitchStreamIE(TwitchBaseIE): # (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON) that will lead to constructing # an invalid m3u8 URL. Working around by use of original channel name from stream # JSON and fallback to lowercase if it's not available. - channel_name = try_get( - stream, lambda x: x['channel']['name'], - compat_str) or channel_name.lower() + channel_id = stream.get('channel', {}).get('name') or channel_id.lower() + + access_token = self._call_api( + 'api/channels/%s/access_token' % channel_id, channel_id, + 'Downloading channel access token') query = { 'allow_source': 'true', @@ -629,11 +600,11 @@ class TwitchStreamIE(TwitchBaseIE): 'playlist_include_framerate': 'true', 'segment_preference': '4', 'sig': access_token['sig'].encode('utf-8'), - 'token': token.encode('utf-8'), + 'token': access_token['token'].encode('utf-8'), } formats = self._extract_m3u8_formats( '%s/api/channel/hls/%s.m3u8?%s' - % (self._USHER_BASE, channel_name, compat_urllib_parse_urlencode(query)), + % (self._USHER_BASE, channel_id, compat_urllib_parse_urlencode(query)), channel_id, 'mp4') self._prefer_source(formats) @@ -656,8 +627,8 @@ class TwitchStreamIE(TwitchBaseIE): }) return { - 'id': str_or_none(stream.get('_id')) or channel_id, - 'display_id': channel_name, + 'id': compat_str(stream['_id']), + 'display_id': channel_id, 'title': title, 'description': description, 'thumbnails': thumbnails, @@ -672,14 +643,7 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'''(?x) - https?:// - (?: - clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)| - (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/ - ) - (?P<id>[^/?#&]+) - ''' + _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -705,12 +669,6 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', 'only_matching': True, - }, { - 'url': 'https://m.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', - 'only_matching': True, - }, { - 'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', - 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 4284487db4..5f8d90fb4e 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -251,10 +251,10 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'Simon Vertugo - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vetugo', + 'uploader': 'Simon Vertugo', 'uploader_id': 'simonvertugo', 'duration': 30.0, 'timestamp': 1455777459, @@ -376,10 +376,6 @@ class TwitterIE(TwitterBaseIE): # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', 'only_matching': True, - }, { - # promo_video_website card - 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', - 'only_matching': True, }] def _real_extract(self, url): @@ -462,11 +458,10 @@ class TwitterIE(TwitterBaseIE): return try_get(o, lambda x: x[x['type'].lower() + '_value']) card_name = card['name'].split(':')[-1] - if card_name in ('amplify', 'promo_video_website'): - is_amplify = card_name == 'amplify' - vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') - content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) - formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) + if card_name == 'amplify': + formats = self._extract_formats_from_vmap_url( + get_binding_value('amplify_url_vmap'), + get_binding_value('amplify_content_id') or twid) self._sort_formats(formats) thumbnails = [] @@ -578,18 +573,6 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): IE_NAME = 'twitter:broadcast' _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})' - _TEST = { - # untitled Periscope video - 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj', - 'info_dict': { - 'id': '1yNGaQLWpejGj', - 'ext': 'mp4', - 'title': 'Andrea May Sahouri - Periscope Broadcast', - 'uploader': 'Andrea May Sahouri', - 'uploader_id': '1PXEdBZWpGwKe', - }, - } - def _real_extract(self, url): broadcast_id = self._match_id(url) broadcast = self._call_api( diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py index 628adf2199..08f0c072e2 100644 --- a/youtube_dl/extractor/uol.py +++ b/youtube_dl/extractor/uol.py @@ -2,17 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) from ..utils import ( clean_html, int_or_none, parse_duration, - parse_iso8601, - qualities, update_url_query, + str_or_none, ) @@ -21,25 +16,21 @@ class UOLIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?uol\.com\.br/.*?(?:(?:mediaId|v)=|view/(?:[a-z0-9]+/)?|video(?:=|/(?:\d{4}/\d{2}/\d{2}/)?))(?P<id>\d+|[\w-]+-[A-Z0-9]+)' _TESTS = [{ 'url': 'http://player.mais.uol.com.br/player_video_v3.swf?mediaId=15951931', - 'md5': '4f1e26683979715ff64e4e29099cf020', + 'md5': '25291da27dc45e0afb5718a8603d3816', 'info_dict': { 'id': '15951931', 'ext': 'mp4', 'title': 'Miss simpatia é encontrada morta', 'description': 'md5:3f8c11a0c0556d66daf7e5b45ef823b2', - 'timestamp': 1470421860, - 'upload_date': '20160805', } }, { 'url': 'http://tvuol.uol.com.br/video/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', - 'md5': '2850a0e8dfa0a7307e04a96c5bdc5bc2', + 'md5': 'e41a2fb7b7398a3a46b6af37b15c00c9', 'info_dict': { 'id': '15954259', 'ext': 'mp4', 'title': 'Incêndio destrói uma das maiores casas noturnas de Londres', 'description': 'Em Londres, um incêndio destruiu uma das maiores boates da cidade. Não há informações sobre vítimas.', - 'timestamp': 1470674520, - 'upload_date': '20160808', } }, { 'url': 'http://mais.uol.com.br/static/uolplayer/index.html?mediaId=15951931', @@ -64,55 +55,91 @@ class UOLIE(InfoExtractor): 'only_matching': True, }] + _FORMATS = { + '2': { + 'width': 640, + 'height': 360, + }, + '5': { + 'width': 1280, + 'height': 720, + }, + '6': { + 'width': 426, + 'height': 240, + }, + '7': { + 'width': 1920, + 'height': 1080, + }, + '8': { + 'width': 192, + 'height': 144, + }, + '9': { + 'width': 568, + 'height': 320, + }, + '11': { + 'width': 640, + 'height': 360, + } + } + def _real_extract(self, url): video_id = self._match_id(url) + media_id = None + + if video_id.isdigit(): + media_id = video_id + + if not media_id: + embed_page = self._download_webpage( + 'https://jsuol.com.br/c/tv/uol/embed/?params=[embed,%s]' % video_id, + video_id, 'Downloading embed page', fatal=False) + if embed_page: + media_id = self._search_regex( + (r'uol\.com\.br/(\d+)', r'mediaId=(\d+)'), + embed_page, 'media id', default=None) + + if not media_id: + webpage = self._download_webpage(url, video_id) + media_id = self._search_regex(r'mediaId=(\d+)', webpage, 'media id') video_data = self._download_json( - # https://api.mais.uol.com.br/apiuol/v4/player/data/[MEDIA_ID] - 'https://api.mais.uol.com.br/apiuol/v3/media/detail/' + video_id, - video_id)['item'] - media_id = compat_str(video_data['mediaId']) + 'http://mais.uol.com.br/apiuol/v3/player/getMedia/%s.json' % media_id, + media_id)['item'] title = video_data['title'] - ver = video_data.get('revision', 2) - uol_formats = self._download_json( - 'https://croupier.mais.uol.com.br/v3/formats/%s/jsonp' % media_id, - media_id) - quality = qualities(['mobile', 'WEBM', '360p', '720p', '1080p']) + query = { + 'ver': video_data.get('numRevision', 2), + 'r': 'http://mais.uol.com.br', + } + for k in ('token', 'sign'): + v = video_data.get(k) + if v: + query[k] = v + formats = [] - for format_id, f in uol_formats.items(): - if not isinstance(f, dict): - continue + for f in video_data.get('formats', []): f_url = f.get('url') or f.get('secureUrl') if not f_url: continue - query = { - 'ver': ver, - 'r': 'http://mais.uol.com.br', - } - for k in ('token', 'sign'): - v = f.get(k) - if v: - query[k] = v f_url = update_url_query(f_url, query) - format_id = format_id - if format_id == 'HLS': - m3u8_formats = self._extract_m3u8_formats( - f_url, media_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - encoded_query = compat_urllib_parse_urlencode(query) - for m3u8_f in m3u8_formats: - m3u8_f['extra_param_to_segment_url'] = encoded_query - m3u8_f['url'] = update_url_query(m3u8_f['url'], query) - formats.extend(m3u8_formats) + format_id = str_or_none(f.get('id')) + if format_id == '10': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) continue - formats.append({ + fmt = { 'format_id': format_id, 'url': f_url, - 'quality': quality(format_id), - 'preference': -1, - }) - self._sort_formats(formats) + 'source_preference': 1, + } + fmt.update(self._FORMATS.get(format_id, {})) + formats.append(fmt) + self._sort_formats(formats, ('height', 'width', 'source_preference', 'tbr', 'ext')) tags = [] for tag in video_data.get('tags', []): @@ -121,24 +148,12 @@ class UOLIE(InfoExtractor): continue tags.append(tag_description) - thumbnails = [] - for q in ('Small', 'Medium', 'Wmedium', 'Large', 'Wlarge', 'Xlarge'): - q_url = video_data.get('thumb' + q) - if not q_url: - continue - thumbnails.append({ - 'id': q, - 'url': q_url, - }) - return { 'id': media_id, 'title': title, - 'description': clean_html(video_data.get('description')), - 'thumbnails': thumbnails, - 'duration': parse_duration(video_data.get('duration')), + 'description': clean_html(video_data.get('desMedia')), + 'thumbnail': video_data.get('thumbnail'), + 'duration': int_or_none(video_data.get('durationSeconds')) or parse_duration(video_data.get('duration')), 'tags': tags, 'formats': formats, - 'timestamp': parse_iso8601(video_data.get('publishDate'), ' '), - 'view_count': int_or_none(video_data.get('viewsQtty')), } diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index e374995128..8fdfd743d0 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -1,50 +1,35 @@ # coding: utf-8 from __future__ import unicode_literals -import functools +import re +import time import hashlib import json import random -import re -import time from .adobepass import AdobePassIE -from .common import InfoExtractor from .youtube import YoutubeIE +from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, ) from ..utils import ( - clean_html, ExtractorError, int_or_none, - OnDemandPagedList, parse_age_limit, str_or_none, try_get, ) -class ViceBaseIE(InfoExtractor): - def _call_api(self, resource, resource_key, resource_id, locale, fields, args=''): - return self._download_json( - 'https://video.vice.com/api/v1/graphql', resource_id, query={ - 'query': '''{ - %s(locale: "%s", %s: "%s"%s) { - %s - } -}''' % (resource, locale, resource_key, resource_id, args, fields), - })['data'][resource] - - -class ViceIE(ViceBaseIE, AdobePassIE): +class ViceIE(AdobePassIE): IE_NAME = 'vice' - _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})' + _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?viceland)\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]+)' _TESTS = [{ 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', 'info_dict': { - 'id': '58c69e38a55424f1227dc3f7', + 'id': '5e647f0125e145c9aef2069412c0cbde', 'ext': 'mp4', 'title': '10 Questions You Always Wanted To Ask: Pet Cremator', 'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5', @@ -58,16 +43,17 @@ class ViceIE(ViceBaseIE, AdobePassIE): # m3u8 download 'skip_download': True, }, + 'add_ie': ['UplynkPreplay'], }, { # geo restricted to US 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', 'info_dict': { - 'id': '5816510690b70e6c5fd39a56', + 'id': '930c0ad1f47141cc955087eecaddb0e2', 'ext': 'mp4', - 'uploader': 'vice', + 'uploader': 'waypoint', 'title': 'The Signal From Tölva', 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', - 'uploader_id': '57a204088cb727dec794c67b', + 'uploader_id': '57f7d621e05ca860fa9ccaf9', 'timestamp': 1477941983, 'upload_date': '20161031', }, @@ -75,14 +61,15 @@ class ViceIE(ViceBaseIE, AdobePassIE): # m3u8 download 'skip_download': True, }, + 'add_ie': ['UplynkPreplay'], }, { 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', 'info_dict': { 'id': '581b12b60a0e1f4c0fb6ea2f', 'ext': 'mp4', 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', - 'description': 'Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.', - 'uploader': 'vice', + 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>', + 'uploader': 'VICE', 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1485368119, 'upload_date': '20170125', @@ -91,7 +78,9 @@ class ViceIE(ViceBaseIE, AdobePassIE): 'params': { # AES-encrypted m3u8 'skip_download': True, + 'proxy': '127.0.0.1:8118', }, + 'add_ie': ['UplynkPreplay'], }, { 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', 'only_matching': True, @@ -109,7 +98,7 @@ class ViceIE(ViceBaseIE, AdobePassIE): @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})', + r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]+)', webpage) @staticmethod @@ -120,16 +109,31 @@ class ViceIE(ViceBaseIE, AdobePassIE): def _real_extract(self, url): locale, video_id = re.match(self._VALID_URL, url).groups() - video = self._call_api('videos', 'id', video_id, locale, '''body - locked - rating - thumbnail_url - title''')[0] - title = video['title'].strip() + webpage = self._download_webpage( + 'https://video.vice.com/%s/embed/%s' % (locale, video_id), + video_id) + + video = self._parse_json( + self._search_regex( + r'PREFETCH_DATA\s*=\s*({.+?})\s*;\s*\n', webpage, + 'app state'), video_id)['video'] + video_id = video.get('vms_id') or video.get('id') or video_id + title = video['title'] + is_locked = video.get('locked') rating = video.get('rating') + thumbnail = video.get('thumbnail_url') + duration = int_or_none(video.get('duration')) + series = try_get( + video, lambda x: x['episode']['season']['show']['title'], + compat_str) + episode_number = try_get( + video, lambda x: x['episode']['episode_number']) + season_number = try_get( + video, lambda x: x['episode']['season']['season_number']) + uploader = None query = {} - if video.get('locked'): + if is_locked: resource = self._get_mvpd_resource( 'VICELAND', title, video_id, rating) query['tvetoken'] = self._extract_mvpd_auth( @@ -144,9 +148,12 @@ class ViceIE(ViceBaseIE, AdobePassIE): query.update({ 'exp': exp, 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), - 'skipadstitching': 1, + '_ad_blocked': None, + '_ad_unit': '', + '_debug': '', 'platform': 'desktop', 'rn': random.randint(10000, 100000), + 'fbprebidtoken': '', }) try: @@ -162,94 +169,85 @@ class ViceIE(ViceBaseIE, AdobePassIE): raise video_data = preplay['video'] - formats = self._extract_m3u8_formats( - preplay['playURL'], video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) - episode = video_data.get('episode') or {} - channel = video_data.get('channel') or {} - season = video_data.get('season') or {} + base = video_data['base'] + uplynk_preplay_url = preplay['preplayURL'] + episode = video_data.get('episode', {}) + channel = video_data.get('channel', {}) subtitles = {} - for subtitle in preplay.get('subtitleURLs', []): - cc_url = subtitle.get('url') - if not cc_url: - continue - language_code = try_get(subtitle, lambda x: x['languages'][0]['language_code'], compat_str) or 'en' - subtitles.setdefault(language_code, []).append({ + cc_url = preplay.get('ccURL') + if cc_url: + subtitles['en'] = [{ 'url': cc_url, - }) + }] return { - 'formats': formats, + '_type': 'url_transparent', + 'url': uplynk_preplay_url, 'id': video_id, 'title': title, - 'description': clean_html(video.get('body')), - 'thumbnail': video.get('thumbnail_url'), - 'duration': int_or_none(video_data.get('video_duration')), + 'description': base.get('body') or base.get('display_body'), + 'thumbnail': thumbnail, + 'duration': int_or_none(video_data.get('video_duration')) or duration, 'timestamp': int_or_none(video_data.get('created_at'), 1000), - 'age_limit': parse_age_limit(video_data.get('video_rating') or rating), - 'series': try_get(video_data, lambda x: x['show']['base']['display_title'], compat_str), - 'episode_number': int_or_none(episode.get('episode_number')), + 'age_limit': parse_age_limit(video_data.get('video_rating')), + 'series': video_data.get('show_title') or series, + 'episode_number': int_or_none(episode.get('episode_number') or episode_number), 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), - 'season_number': int_or_none(season.get('season_number')), - 'season_id': str_or_none(season.get('id') or video_data.get('season_id')), - 'uploader': channel.get('name'), + 'season_number': int_or_none(season_number), + 'season_id': str_or_none(episode.get('season_id')), + 'uploader': channel.get('base', {}).get('title') or channel.get('name') or uploader, 'uploader_id': str_or_none(channel.get('id')), 'subtitles': subtitles, + 'ie_key': 'UplynkPreplay', } -class ViceShowIE(ViceBaseIE): +class ViceShowIE(InfoExtractor): IE_NAME = 'vice:show' - _VALID_URL = r'https?://(?:video\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/show/(?P<id>[^/?#&]+)' - _PAGE_SIZE = 25 - _TESTS = [{ - 'url': 'https://video.vice.com/en_us/show/fck-thats-delicious', - 'info_dict': { - 'id': '57a2040c8cb727dec794c901', - 'title': 'F*ck, That’s Delicious', - 'description': 'The life and eating habits of rap’s greatest bon vivant, Action Bronson.', - }, - 'playlist_mincount': 64, - }, { - 'url': 'https://www.vicetv.com/en_us/show/fck-thats-delicious', - 'only_matching': True, - }] + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' - def _fetch_page(self, locale, show_id, page): - videos = self._call_api('videos', 'show_id', show_id, locale, '''body - id - url''', ', page: %d, per_page: %d' % (page + 1, self._PAGE_SIZE)) - for video in videos: - yield self.url_result( - video['url'], ViceIE.ie_key(), video.get('id')) + _TEST = { + 'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2', + 'info_dict': { + 'id': 'fuck-thats-delicious-2', + 'title': "Fuck, That's Delicious", + 'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.', + }, + 'playlist_count': 17, + } def _real_extract(self, url): - locale, display_id = re.match(self._VALID_URL, url).groups() - show = self._call_api('shows', 'slug', display_id, locale, '''dek - id - title''')[0] - show_id = show['id'] + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) - entries = OnDemandPagedList( - functools.partial(self._fetch_page, locale, show_id), - self._PAGE_SIZE) + entries = [ + self.url_result(video_url, ViceIE.ie_key()) + for video_url, _ in re.findall( + r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"' + % ViceIE._VALID_URL, webpage)] - return self.playlist_result( - entries, show_id, show.get('title'), show.get('dek')) + title = self._search_regex( + r'<title>(.+?)', webpage, 'title', default=None) + if title: + title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() + description = self._html_search_meta( + 'description', webpage, 'description') + + return self.playlist_result(entries, show_id, title, description) -class ViceArticleIE(ViceBaseIE): +class ViceArticleIE(InfoExtractor): IE_NAME = 'vice:article' - _VALID_URL = r'https://(?:www\.)?vice\.com/(?P[^/]+)/article/(?:[0-9a-z]{6}/)?(?P[^?#]+)' + _VALID_URL = r'https://www\.vice\.com/[^/]+/article/(?P[^?#]+)' _TESTS = [{ 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', 'info_dict': { - 'id': '58dc0a3dee202d2a0ccfcbd8', + 'id': '41eae2a47b174a1398357cec55f1f6fc', 'ext': 'mp4', - 'title': 'Mormon War on Porn', - 'description': 'md5:1c5d91fe25fa8aa304f9def118b92dbf', + 'title': 'Mormon War on Porn ', + 'description': 'md5:6394a8398506581d0346b9ab89093fef', 'uploader': 'vice', 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1491883129, @@ -260,10 +258,10 @@ class ViceArticleIE(ViceBaseIE): # AES-encrypted m3u8 'skip_download': True, }, - 'add_ie': [ViceIE.ie_key()], + 'add_ie': ['UplynkPreplay'], }, { 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', - 'md5': '13010ee0bc694ea87ec40724397c2349', + 'md5': '7fe8ebc4fa3323efafc127b82bd821d9', 'info_dict': { 'id': '3jstaBeXgAs', 'ext': 'mp4', @@ -273,15 +271,15 @@ class ViceArticleIE(ViceBaseIE): 'uploader_id': 'MotherboardTV', 'upload_date': '20140529', }, - 'add_ie': [YoutubeIE.ie_key()], + 'add_ie': ['Youtube'], }, { 'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded', 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', 'info_dict': { - 'id': '57f41d3556a0a80f54726060', + 'id': 'e2ed435eb67e43efb66e6ef9a6930a88', 'ext': 'mp4', 'title': "Making The World's First Male Sex Doll", - 'description': 'md5:19b00b215b99961cf869c40fbe9df755', + 'description': 'md5:916078ef0e032d76343116208b6cc2c4', 'uploader': 'vice', 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1476919911, @@ -290,7 +288,6 @@ class ViceArticleIE(ViceBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, 'add_ie': [ViceIE.ie_key()], }, { @@ -302,11 +299,14 @@ class ViceArticleIE(ViceBaseIE): }] def _real_extract(self, url): - locale, display_id = re.match(self._VALID_URL, url).groups() + display_id = self._match_id(url) - article = self._call_api('articles', 'slug', display_id, locale, '''body - embed_code''')[0] - body = article['body'] + webpage = self._download_webpage(url, display_id) + + prefetch_data = self._parse_json(self._search_regex( + r'__APP_STATE\s*=\s*({.+?})(?:\s*\|\|\s*{}\s*)?;\s*\n', + webpage, 'app state'), display_id)['pageData'] + body = prefetch_data['body'] def _url_res(video_url, ie_key): return { @@ -316,7 +316,7 @@ class ViceArticleIE(ViceBaseIE): 'ie_key': ie_key, } - vice_url = ViceIE._extract_url(body) + vice_url = ViceIE._extract_url(webpage) if vice_url: return _url_res(vice_url, ViceIE.ie_key()) @@ -332,6 +332,6 @@ class ViceArticleIE(ViceBaseIE): video_url = self._html_search_regex( r'data-video-url="([^"]+)"', - article['embed_code'], 'video URL') + prefetch_data['embed_code'], 'video URL') return _url_res(video_url, ViceIE.ie_key()) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index d6b92b1c83..851ad936cf 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -1,62 +1,28 @@ from __future__ import unicode_literals -import json +import base64 import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, + clean_html, + determine_ext, int_or_none, + js_to_json, parse_age_limit, + parse_duration, + try_get, ) class ViewLiftBaseIE(InfoExtractor): - _API_BASE = 'https://prod-api.viewlift.com/' - _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv' - _SITE_MAP = { - 'ftfnext': 'lax', - 'funnyforfree': 'snagfilms', - 'hoichoi': 'hoichoitv', - 'kiddovid': 'snagfilms', - 'laxsportsnetwork': 'lax', - 'legapallacanestro': 'lnp', - 'marquee': 'marquee-tv', - 'monumentalsportsnetwork': 'monumental-network', - 'moviespree': 'bingeflix', - 'pflmma': 'pfl', - 'snagxtreme': 'snagfilms', - 'theidentitytb': 'tampabay', - 'vayafilm': 'snagfilms', - } - _TOKENS = {} - - def _call_api(self, site, path, video_id, query): - token = self._TOKENS.get(site) - if not token: - token_query = {'site': site} - email, password = self._get_login_info(netrc_machine=site) - if email: - resp = self._download_json( - self._API_BASE + 'identity/signin', video_id, - 'Logging in', query=token_query, data=json.dumps({ - 'email': email, - 'password': password, - }).encode()) - else: - resp = self._download_json( - self._API_BASE + 'identity/anonymous-token', video_id, - 'Downloading authorization token', query=token_query) - self._TOKENS[site] = token = resp['authorizationToken'] - return self._download_json( - self._API_BASE + path, video_id, - headers={'Authorization': token}, query=query) + _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' class ViewLiftEmbedIE(ViewLiftBaseIE): - IE_NAME = 'viewlift:embed' - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -64,9 +30,6 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'id': '74849a00-85a9-11e1-9660-123139220831', 'ext': 'mp4', 'title': '#whilewewatch', - 'description': 'md5:b542bef32a6f657dadd0df06e26fb0c8', - 'timestamp': 1334350096, - 'upload_date': '20120413', } }, { # invalid labels, 360p is better that 480p @@ -76,8 +39,7 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', 'ext': 'mp4', 'title': 'Life in Limbo', - }, - 'skip': 'The video does not exist', + } }, { 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', 'only_matching': True, @@ -92,68 +54,67 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): return mobj.group('url') def _real_extract(self, url): - domain, film_id = re.match(self._VALID_URL, url).groups() - site = domain.split('.')[-2] - if site in self._SITE_MAP: - site = self._SITE_MAP[site] - try: - content_data = self._call_api( - site, 'entitlement/video/status', film_id, { - 'id': film_id - })['video'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage') - if error_message == 'User does not have a valid subscription or has not purchased this content.': - self.raise_login_required() - raise ExtractorError(error_message, expected=True) - raise - gist = content_data['gist'] - title = gist['title'] - video_assets = content_data['streamingInfo']['videoAssets'] + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if '>This film is not playable in your area.<' in webpage: + raise ExtractorError( + 'Film %s is not playable in your area.' % video_id, expected=True) formats = [] - mpeg_video_assets = video_assets.get('mpeg') or [] - for video_asset in mpeg_video_assets: - video_asset_url = video_asset.get('url') - if not video_asset: + has_bitrate = False + sources = self._parse_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, + 'sources', default='[]'), video_id, js_to_json) + for source in sources: + file_ = source.get('file') + if not file_: continue - bitrate = int_or_none(video_asset.get('bitrate')) - height = int_or_none(self._search_regex( - r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), - 'height', default=None)) - formats.append({ - 'url': video_asset_url, - 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), - 'tbr': bitrate, - 'height': height, - 'vcodec': video_asset.get('codec'), - }) + type_ = source.get('type') + ext = determine_ext(file_) + format_id = source.get('label') or ext + if all(v in ('m3u8', 'hls') for v in (type_, ext)): + formats.extend(self._extract_m3u8_formats( + file_, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + bitrate = int_or_none(self._search_regex( + [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], + file_, 'bitrate', default=None)) + if not has_bitrate and bitrate: + has_bitrate = True + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) + formats.append({ + 'url': file_, + 'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')), + 'tbr': bitrate, + 'height': height, + }) + if not formats: + hls_url = self._parse_json(self._search_regex( + r'filmInfo\.src\s*=\s*({.+?});', + webpage, 'src'), video_id, js_to_json)['src'] + formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') + self._sort_formats(formats, field_preference) - hls_url = video_assets.get('hls') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats, ('height', 'tbr', 'format_id')) + title = self._search_regex( + [r"title\s*:\s*'([^']+)'", r'([^<]+)'], + webpage, 'title') - info = { - 'id': film_id, + return { + 'id': video_id, 'title': title, - 'description': gist.get('description'), - 'thumbnail': gist.get('videoImageUrl'), - 'duration': int_or_none(gist.get('runtime')), - 'age_limit': parse_age_limit(content_data.get('parentalRating')), - 'timestamp': int_or_none(gist.get('publishDate'), 1000), 'formats': formats, } - for k in ('categories', 'tags'): - info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] - return info class ViewLiftIE(ViewLiftBaseIE): - IE_NAME = 'viewlift' - _VALID_URL = r'https?://(?:www\.)?(?P%s)(?P(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX + _VALID_URL = r'https?://(?:www\.)?(?P%s)(?:/(?:films/title|show|(?:news/)?videos?))?/(?P[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', 'md5': '19844f897b35af219773fd63bdec2942', @@ -190,13 +151,10 @@ class ViewLiftIE(ViewLiftBaseIE): 'id': '00000148-7b53-de26-a9fb-fbf306f70020', 'display_id': 'augie_alone/s_2_ep_12_love', 'ext': 'mp4', - 'title': 'S. 2 Ep. 12 - Love', - 'description': 'Augie finds love.', + 'title': 'Augie, Alone:S. 2 Ep. 12 - Love', + 'description': 'md5:db2a5c72d994f16a780c1eb353a8f403', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 107, - 'upload_date': '20141012', - 'timestamp': 1413129540, - 'age_limit': 17, }, 'params': { 'skip_download': True, @@ -219,9 +177,6 @@ class ViewLiftIE(ViewLiftBaseIE): # Was once Kaltura embed 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', 'only_matching': True, - }, { - 'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters', - 'only_matching': True, }] @classmethod @@ -229,22 +184,119 @@ class ViewLiftIE(ViewLiftBaseIE): return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url) def _real_extract(self, url): - domain, path, display_id = re.match(self._VALID_URL, url).groups() - site = domain.split('.')[-2] - if site in self._SITE_MAP: - site = self._SITE_MAP[site] - modules = self._call_api( - site, 'content/pages', display_id, { - 'includeContent': 'true', - 'moduleOffset': 1, - 'path': path, - 'site': site, - })['modules'] - film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule') - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'ie_key': 'ViewLiftEmbed', - } + domain, display_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage(url, display_id) + + if ">Sorry, the Film you're looking for is not available.<" in webpage: + raise ExtractorError( + 'Film %s is not available.' % display_id, expected=True) + + initial_store_state = self._search_regex( + r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", + webpage, 'Initial Store State', default=None) + if initial_store_state: + modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( + initial_store_state).decode()), display_id)['page']['data']['modules'] + content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') + gist = content_data['gist'] + film_id = gist['id'] + title = gist['title'] + video_assets = try_get( + content_data, lambda x: x['streamingInfo']['videoAssets'], dict) + if not video_assets: + token = self._download_json( + 'https://prod-api.viewlift.com/identity/anonymous-token', + film_id, 'Downloading authorization token', + query={'site': 'snagfilms'})['authorizationToken'] + video_assets = self._download_json( + 'https://prod-api.viewlift.com/entitlement/video/status', + film_id, headers={ + 'Authorization': token, + 'Referer': url, + }, query={ + 'id': film_id + })['video']['streamingInfo']['videoAssets'] + + formats = [] + mpeg_video_assets = video_assets.get('mpeg') or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset: + continue + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) + + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'format_id')) + + info = { + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), + 'formats': formats, + } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info + else: + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + + snag = self._parse_json( + self._search_regex( + r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._html_search_regex( + (r'itemprop="title">([^<]+)<', + r'(?s)itemprop="title">(.+?)(.+?)', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'([^<]+)', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + 'ie_key': 'ViewLiftEmbed', + } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 421795b94d..baa46d5f35 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -33,7 +33,6 @@ from ..utils import ( unified_timestamp, unsmuggle_url, urlencode_postdata, - urljoin, unescapeHTML, ) @@ -140,28 +139,28 @@ class VimeoBaseInfoExtractor(InfoExtractor): }) # TODO: fix handling of 308 status code returned for live archive manifest requests - sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): manifest_url = cdn_data.get('url') if not manifest_url: continue format_id = '%s-%s' % (files_type, cdn_name) - sep_manifest_urls = [] - if re.search(sep_pattern, manifest_url): - for suffix, repl in (('', 'video'), ('_sep', 'sep/video')): - sep_manifest_urls.append((format_id + suffix, re.sub( - sep_pattern, '/%s/' % repl, manifest_url))) - else: - sep_manifest_urls = [(format_id, manifest_url)] - for f_id, m_url in sep_manifest_urls: - if files_type == 'hls': - formats.extend(self._extract_m3u8_formats( - m_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id, - note='Downloading %s m3u8 information' % cdn_name, - fatal=False)) - elif files_type == 'dash': + if files_type == 'hls': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', m3u8_id=format_id, + note='Downloading %s m3u8 information' % cdn_name, + fatal=False)) + elif files_type == 'dash': + mpd_pattern = r'/%s/(?:sep/)?video/' % video_id + mpd_manifest_urls = [] + if re.search(mpd_pattern, manifest_url): + for suffix, repl in (('', 'video'), ('_sep', 'sep/video')): + mpd_manifest_urls.append((format_id + suffix, re.sub( + mpd_pattern, '/%s/%s/' % (video_id, repl), manifest_url))) + else: + mpd_manifest_urls = [(format_id, manifest_url)] + for f_id, m_url in mpd_manifest_urls: if 'json=1' in m_url: real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url') if real_m_url: @@ -170,6 +169,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): m_url.replace('/master.json', '/master.mpd'), video_id, f_id, 'Downloading %s MPD information' % cdn_name, fatal=False) + for f in mpd_formats: + if f.get('vcodec') == 'none': + f['preference'] = -50 + elif f.get('acodec') == 'none': + f['preference'] = -40 formats.extend(mpd_formats) live_archive = live_event.get('archive') or {} @@ -181,19 +185,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'preference': 1, }) - for f in formats: - if f.get('vcodec') == 'none': - f['preference'] = -50 - elif f.get('acodec') == 'none': - f['preference'] = -40 - subtitles = {} text_tracks = config['request'].get('text_tracks') if text_tracks: for tt in text_tracks: subtitles[tt['lang']] = [{ 'ext': 'vtt', - 'url': urljoin('https://vimeo.com', tt['url']), + 'url': 'https://vimeo.com' + tt['url'], }] thumbnails = [] @@ -593,7 +591,7 @@ class VimeoIE(VimeoBaseInfoExtractor): # Retrieve video webpage to extract further information webpage, urlh = self._download_webpage_handle( url, video_id, headers=headers) - redirect_url = urlh.geturl() + redirect_url = compat_str(urlh.geturl()) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() @@ -843,6 +841,33 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self._TITLE or self._html_search_regex( self._TITLE_RE, webpage, 'list title', fatal=False) + def _login_list_password(self, page_url, list_id, webpage): + login_form = self._search_regex( + r'(?s)]+?id="pw_form"(.*?)', + webpage, 'login form', default=None) + if not login_form: + return webpage + + password = self._downloader.params.get('videopassword') + if password is None: + raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True) + fields = self._hidden_inputs(login_form) + token, vuid = self._extract_xsrft_and_vuid(webpage) + fields['token'] = token + fields['password'] = password + post = urlencode_postdata(fields) + password_path = self._search_regex( + r'action="([^"]+)"', login_form, 'password URL') + password_url = compat_urlparse.urljoin(page_url, password_path) + password_request = sanitized_Request(password_url, post) + password_request.add_header('Content-type', 'application/x-www-form-urlencoded') + self._set_vimeo_cookie('vuid', vuid) + self._set_vimeo_cookie('xsrft', token) + + return self._download_webpage( + password_request, list_id, + 'Verifying the password', 'Wrong password') + def _title_and_entries(self, list_id, base_url): for pagenum in itertools.count(1): page_url = self._page_url(base_url, pagenum) @@ -851,6 +876,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): 'Downloading page %s' % pagenum) if pagenum == 1: + webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) # Try extracting href first since not all videos are available via @@ -897,7 +923,7 @@ class VimeoUserIE(VimeoChannelIE): _BASE_URL_TEMPL = 'https://vimeo.com/%s' -class VimeoAlbumIE(VimeoBaseInfoExtractor): +class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'