Merge a5ccc7f0aa into 12d8ea8246

[ie/youtube] Remove `android` from default clients (#9553)
Closes #9554
Authored by: coletdjnz, bashonly
Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2024-06-08 22:38:35 +02:00 · 2024-05-18 03:34:24 +05:30 · 2024-05-17 16:03:02 +00:00 · 2024-04-29 15:23:46 +02:00 · 2024-01-06 19:08:34 +01:00 · 2023-12-15 16:09:09 +01:00
5 changed files with 153 additions and 31 deletions
--- a/README.md
+++ b/README.md
@@ -1760,7 +1760,7 @@ # EXTRACTOR ARGUMENTS
 #### youtube
 * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes
 * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively
-* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen`, `mediaconnect` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
+* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. The `android` clients will always be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients.
 * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
 * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp.
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -754,6 +754,7 @@
 )
 from .holodex import HolodexIE
 from .hotnewhiphop import HotNewHipHopIE
+from .hotmart import HotmartIE
 from .hotstar import (
    HotStarIE,
    HotStarPrefixIE,
--- a/yt_dlp/extractor/hotmart.py
+++ b/yt_dlp/extractor/hotmart.py
@@ -0,0 +1,79 @@
+from .common import InfoExtractor
+from ..utils import (
+    get_element_by_id,
+    traverse_obj,
+    int_or_none,
+    url_or_none,
+)
+
+
+class HotmartIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.hotmart\.com/embed/(?P<id>[a-zA-Z0-9]+)'
+    _TESTS = [
+        {
+            'url': (
+                'https://player.hotmart.com/embed/pRQKDWkKLB?signature=S0Pr1OaDwGvKwQ8i6Y9whykEo4uuok2P4AShiYcyarvFkQDT_rBlR5L1qdIbIferFBHfTVJlXcbgUAwMMPiV6sWaA0XIU4OO282MO092DX_Z8KqS1h0Y-452TMjAt3dW2ZYMKWtfA2A2sxM7JmpYZZdMKTrT7nwoPsfbythXfph3dCLzxNQ0gS-rHfD7SYWuKJGN1JmK6iAygJf1thpskoeOJyK04SpDwMoqIOYfsrUktvsJFlV3oWM1tVoeDIQPWSZGXE6WRWDPNmTz6h7IHvc-QKGzoRy3_CvzSEioq2SaDNDdloECrKH37V1eCNvdaIr0dQeHqH_vI0NMBsfCow==&token=aa2d356b-e2f0-45e8-9725-e0efc7b5d29c&autoplay=autoplay'
+            ),
+            'md5': '95d7a252bb97954663fcf6c6db4b4555',
+            'info_dict': {
+                'id': 'pRQKDWkKLB',
+                'video_id': 'pRQKDWkKLB',
+                'ext': 'mp4',
+                'title': 'Hotmart video #pRQKDWkKLB',
+                'thumbnail': (
+                    r're:https?://.*\.(?:jpg|jpeg|png|gif)\?token=exp=\d+~acl=.*~hmac=[a-f0-9]+$'
+                ),
+            },
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_data_string = get_element_by_id('__NEXT_DATA__', webpage)
+        video_data = self._parse_json(video_data_string, video_id, fatal=False)
+
+        title = self._html_search_meta(
+            ['og:title', 'title', 'twitter:title'],
+            webpage, 'title', default='Hotmart video #' + video_id
+        )
+
+        url = traverse_obj(
+            video_data,
+            (
+                'props',
+                'pageProps',
+                'applicationData',
+                'mediaAssets',
+                0,
+                'urlEncrypted',
+            ),
+            expected_type=url_or_none,
+        )
+        thumbnail_url = traverse_obj(
+            video_data,
+            ('props', 'pageProps', 'applicationData', 'thumbnailUrl'),
+            expected_type=url_or_none,
+        )
+
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            url, video_id, 'mp4', fatal=False
+        )
+
+        description = self._og_search_description(webpage, default=None)
+        chapter = None
+        chapter_number = None
+
+        return {
+            'id': video_id,
+            'video_id': video_id,
+            'thumbnail': thumbnail_url,
+            'formats': formats,
+            'subtitles': subtitles,
+            'title': title,
+            'description': description,
+            'chapter': chapter,
+            'chapter_number': int_or_none(chapter_number),
+        }
--- a/yt_dlp/extractor/teachable.py
+++ b/yt_dlp/extractor/teachable.py
@@ -1,12 +1,14 @@
 import re

 from .common import InfoExtractor
-from .wistia import WistiaIE
+from .hotmart import HotmartIE
 from ..utils import (
    clean_html,
+    extract_attributes,
    ExtractorError,
-    int_or_none,
    get_element_by_class,
+    get_elements_html_by_class,
+    int_or_none,
    strip_or_none,
    urlencode_postdata,
    urljoin,
@@ -111,15 +113,16 @@ class TeachableIE(TeachableBaseIE):
    _TESTS = [{
        'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364',
        'info_dict': {
-            'id': 'untlgzk1v7',
-            'ext': 'bin',
+            'id': 'Nq7vkXmXRA',
+            'video_id': 'Nq7vkXmXRA',
+            'ext': 'mp4',
            'title': 'Overview',
-            'description': 'md5:071463ff08b86c208811130ea1c2464c',
-            'duration': 736.4,
-            'timestamp': 1542315762,
-            'upload_date': '20181115',
            'chapter': 'Welcome',
            'chapter_number': 1,
+            'webpage_url': r're:https://player.hotmart.com/embed/Nq7vkXmXRA\?signature=.+&token=.+',
+            'width': 1920,
+            'height': 1080,
+            'thumbnail': r're:https?://.*\.(?:jpg|jpeg|webp)\?token=exp=\d+~acl=.*~hmac=[a-f0-9]+$',
        },
        'params': {
            'skip_download': True,
@@ -127,6 +130,9 @@ class TeachableIE(TeachableBaseIE):
    }, {
        'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100',
        'only_matching': True,
+    }, {
+        'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100',
+        'only_matching': True,
    }, {
        'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939',
        'only_matching': True,
@@ -161,13 +167,35 @@ def _real_extract(self, url):

        webpage = self._download_webpage(url, video_id)

-        wistia_urls = WistiaIE._extract_embed_urls(url, webpage)
-        if not wistia_urls:
+        hotmart_container_elements = get_elements_html_by_class(
+            'hotmart_video_player', webpage
+        )
+        hotmart_urls = []
+        for hotmart_container_element in hotmart_container_elements:
+            hotmart_container_attributes = extract_attributes(hotmart_container_element)
+            attachment_id = hotmart_container_attributes['data-attachment-id']
+
+            hotmart_video_url_data = self._download_json(
+                f'https://{site}/api/v2/hotmart/private_video',
+                video_id,
+                query={'attachment_id': attachment_id},
+            )
+
+            hotmart_url = (
+                'https://player.hotmart.com/embed/'
+                f'{hotmart_video_url_data ["video_id"]}?'
+                f'signature={hotmart_video_url_data ["signature"]}&'
+                'token='
+                f'{hotmart_video_url_data ["teachable_application_key"]}'
+            )
+
+            hotmart_urls.append(hotmart_url)
+
+        if not hotmart_urls:
            if any(re.search(p, webpage) for p in (
                    r'class=["\']lecture-contents-locked',
                    r'>\s*Lecture contents locked',
                    r'id=["\']lecture-locked',
-                    # https://academy.tailoredtutors.co.uk/courses/108779/lectures/1955313
                    r'class=["\'](?:inner-)?lesson-locked',
                    r'>LESSON LOCKED<')):
                self.raise_login_required('Lecture contents locked')
@@ -196,14 +224,16 @@ def _real_extract(self, url):
                if chapter_number <= len(sections):
                    chapter = sections[chapter_number - 1]

-        entries = [{
-            '_type': 'url_transparent',
-            'url': wistia_url,
-            'ie_key': WistiaIE.ie_key(),
-            'title': title,
-            'chapter': chapter,
-            'chapter_number': chapter_number,
-        } for wistia_url in wistia_urls]
+        entries = []
+        for hotmart_url in hotmart_urls:
+            entries.append({
+                '_type': 'url_transparent',
+                'url': hotmart_url,
+                'ie_key': HotmartIE.ie_key(),
+                'title': title,
+                'chapter': chapter,
+                'chapter_number': chapter_number,
+            })

        return self.playlist_result(entries, video_id, title)

--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2353,6 +2353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'format': '17',  # 3gp format available on android
                'extractor_args': {'youtube': {'player_client': ['android']}},
            },
+            'skip': 'android client broken',
        },
        {
            # Skip download of additional client configs (remix client config in this case)
@@ -2730,7 +2731,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'heatmap': 'count:100',
            },
            'params': {
-                'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}},
+                'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}},
            },
        },
    ]
@@ -3662,8 +3663,6 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
        yt_query = {
            'videoId': video_id,
        }
-        if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'):
-            yt_query['params'] = 'CgIIAQ=='

        pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
        if pp_arg:
@@ -3679,19 +3678,24 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,

    def _get_requested_clients(self, url, smuggled_data):
        requested_clients = []
-        default = ['ios', 'android', 'web']
+        android_clients = []
+        default = ['ios', 'web']
        allowed_clients = sorted(
            (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'),
            key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
        for client in self._configuration_arg('player_client'):
-            if client in allowed_clients:
-                requested_clients.append(client)
-            elif client == 'default':
+            if client == 'default':
                requested_clients.extend(default)
            elif client == 'all':
                requested_clients.extend(allowed_clients)
-            else:
+            elif client not in allowed_clients:
                self.report_warning(f'Skipping unsupported client {client}')
+            elif client.startswith('android'):
+                android_clients.append(client)
+            else:
+                requested_clients.append(client)
+        # Force deprioritization of broken Android clients for format de-duplication
+        requested_clients.extend(android_clients)
        if not requested_clients:
            requested_clients = default

@@ -3910,6 +3914,14 @@ def build_fragments(f):
                    f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)

            client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
+            # Android client formats are broken due to integrity check enforcement
+            # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554
+            is_broken = client_name and client_name.startswith(short_client_name('android'))
+            if is_broken:
+                self.report_warning(
+                    f'{video_id}: Android client formats are broken and may yield HTTP Error 403. '
+                    'They will be deprioritized', only_once=True)
+
            name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
            fps = int_or_none(fmt.get('fps')) or 0
            dct = {
@@ -3922,7 +3934,7 @@ def build_fragments(f):
                    name, fmt.get('isDrc') and 'DRC',
                    try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                    try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
-                    throttled and 'THROTTLED', is_damaged and 'DAMAGED',
+                    throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN',
                    (self.get_param('verbose') or all_formats) and client_name,
                    delim=', '),
                # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
@@ -3940,8 +3952,8 @@ def build_fragments(f):
                'language': join_nonempty(audio_track.get('id', '').split('.')[0],
                                          'desc' if language_preference < -1 else '') or None,
                'language_preference': language_preference,
-                # Strictly de-prioritize damaged and 3gp formats
-                'preference': -10 if is_damaged else -2 if itag == '17' else None,
+                # Strictly de-prioritize broken, damaged and 3gp formats
+                'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None,
            }
            mime_mobj = re.match(
                r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
Author	SHA1	Message	Date
Abdessamad Derraz	9fee9e2b74	Merge `a5ccc7f0aa` into `12d8ea8246`	2024-05-18 03:34:24 +05:30
coletdjnz	12d8ea8246	[ie/youtube] Remove `android` from default clients (#9553) Closes #9554 Authored by: coletdjnz, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>	2024-05-17 16:03:02 +00:00
Abdessamad Derraz	a5ccc7f0aa	Merge branch 'yt-dlp:master' into teachable-fix-add-hotmart	2024-04-29 15:23:46 +02:00
Abdessamad Derraz	7d87c09c5d	Merge branch 'yt-dlp:master' into teachable-fix-add-hotmart	2024-01-06 19:08:34 +01:00
Abdessamad Derraz	8a0f2ffd80	Merge branch 'yt-dlp:master' into teachable-fix-add-hotmart	2023-12-15 16:09:09 +01:00
Abdessamad DERRAZ	e61606eaf2	[ie/teachable] Remove Wistia support	2023-10-24 21:25:57 +02:00
Abdessamad Derraz	931a90e7da	Merge branch 'yt-dlp:master' into teachable-fix-add-hotmart	2023-10-24 20:45:43 +02:00
Abdessamad Derraz	fddf9a60ee	Revert "Refactor HotmartIE tests to generate dynamic URL" This reverts commit `bee20eeb82`.	2023-07-22 07:40:04 +02:00
Abdessamad Derraz	182a18eaa0	Revert "Reorder imports in hotmart.py" This reverts commit `e7f90d887d`.	2023-07-22 07:39:53 +02:00
Abdessamad DERRAZ	e7f90d887d	Reorder imports in hotmart.py This commit reorders the imports in hotmart.py according to PEP8 guidelines for improved readability.	2023-07-22 02:13:09 +02:00
Abdessamad DERRAZ	bee20eeb82	Refactor HotmartIE tests to generate dynamic URL This commit refactors the tests in the HotmartIE class to generate a dynamic URL for testing. Previously, the test URL was hardcoded and could expire, causing the tests to fail. Now, the test URL is generated dynamically by making a request to the Teachable API and constructing the URL from the response. This ensures that the test URL is always valid at the time of testing.	2023-07-22 02:04:45 +02:00
Abdessamad DERRAZ	c1b5121932	Update Teachable Extractor to Support Multiple Videos per Page This commit updates the Teachable extractor function to support the extraction of multiple videos from a single page. Previously, the function only extracted the first video from a page. The updated function now iterates over all video elements on a page and extracts each one. This enhancement improves the functionality of the extractor and allows for more comprehensive scraping of Teachable content.	2023-07-22 00:31:20 +02:00
Abdessamad DERRAZ	7f44d0fa02	Refactoring HotmartIE Extractor for Code Convention Compliance Implemented code convention compliance throughout the HotmartIE extractor. The adjustments span from syntax and structure to proper usage of utility functions. The changes encompass quote usage, inline value extraction, long lines management, import order, fallbacks collapsing, trailing parentheses placement, and the use of conversion and parsing functions like url_or_none and int_or_none for safer data handling.	2023-07-21 19:29:19 +02:00
Abdessamad DERRAZ	19cc97d1a1	Apply minor style corrections This commit introduces a few minor style corrections that were previously overlooked. These corrections ensure that the code adheres to the project's style guidelines and improves overall readability. The changes are minor and do not affect the functionality of the code.	2023-07-21 11:43:27 +02:00
Abdessamad DERRAZ	f5fc4de7c9	Revert unnecessary style changes caused by Black This commit reverts the unnecessary style changes that were introduced by the Black tool. While Black is useful for enforcing PEP 8 style guidelines, it had caused too many modifications in this case, leading to a loss of original code style and readability. This commit ensures that only the necessary style changes for PEP 8 compliance are kept, while the rest of the code remains as originally intended.	2023-07-21 11:35:35 +02:00
Abdessamad DERRAZ	a997e972a5	Fix flake8 E203 errors in teachable.py This commit resolves the flake8 E203 errors in the teachable.py file. The errors were caused by unnecessary whitespace before colons on lines 210 and 386. The removal of these spaces ensures the code adheres to PEP 8 style guidelines, allowing flake8 to run without any errors.	2023-07-20 23:19:07 +02:00
Abdessamad DERRAZ	0dd98c888d	Fix tests for teachable.py and hotmart.py This commit addresses the failing tests in teachable.py and hotmart.py. The changes ensure that the tests pass by correctly handling the temporary URLs generated by these scripts. Please note that the tests are dependent on the validity of these temporary URLs. Therefore, the tests will pass temporarily as long as the URLs are valid. In addition, this commit adheres to the style guide by wrapping the code at 79 characters.	2023-07-20 23:14:06 +02:00
Abdessamad DERRAZ	378ecc6b65	Update Test URL in HotmartIE The test URL in the HotmartIE class has been updated to match the format expected by _VALID_URL. This resolves a test error where the test URL did not match the expected format.	2023-07-20 20:06:48 +02:00
Abdessamad DERRAZ	fe8d9ec837	[extractor] Update Teachable and add Hotmart (Credit: Green0Photon) This commit updates the Teachable extractor to handle Hotmart video URLs and adds a new extractor for Hotmart. The Hotmart extractor is used to handle videos from Hotmart, a platform for selling online courses. The changes allow yt-dlp to download videos from courses hosted on Hotmart. The code for these changes was originally written by Green0Photon and can be found at https://github.com/Green0Photon/yt-dlp/tree/teachable-fix-add-hotmart. This commit is a direct implementation of his work into the main yt-dlp repository. The changes have been tested and confirmed to work as expected. This commit also includes updates to the code style and formatting to match the yt-dlp contributing guidelines.	2023-07-20 19:31:22 +02:00