Merge 172dfbeaed into 12d8ea8246

[ie/youtube] Remove android from default clients (#9553)
Closes #9554 Authored by: coletdjnz, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2024-06-08 14:28:27 +02:00 · 2024-05-17 21:58:17 +02:00 · 2024-05-17 16:03:02 +00:00 · 2024-05-17 14:37:30 +00:00 · 2024-05-17 14:33:12 +00:00 · 2024-05-17 14:28:36 +00:00
10 changed files with 208 additions and 62 deletions
--- a/README.md
+++ b/README.md
@ -666,7 +666,7 @@ ## Filesystem Options:
                                    The name of the browser to load cookies
                                    from. Currently supported browsers are:
                                    brave, chrome, chromium, edge, firefox,
-                                    opera, safari, vivaldi. Optionally, the
+                                    opera, safari, vivaldi, whale. Optionally, the
                                    KEYRING used for decrypting Chromium cookies
                                    on Linux, the name/path of the PROFILE to
                                    load cookies from, and the CONTAINER name
@ -1760,7 +1760,7 @@ # EXTRACTOR ARGUMENTS
 #### youtube
 * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes
 * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively
-* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen`, `mediaconnect` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
+* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. The `android` clients will always be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients.
 * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
 * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp.
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -411,10 +411,15 @@ def test_unified_timestamps(self):
        self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
        self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
        self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
+        self.assertEqual(unified_timestamp('2022-10-13T02:37:47.831Z'), 1665628667)

        self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1)
        self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86)
        self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78)
+        self.assertEqual(unified_timestamp('2023-03-09T18:01:33.646Z', with_milliseconds=True), 1678384893.646)
+        # ISO8601 spec says that if no timezone is specified, we should use local timezone;
+        # but yt-dlp uses UTC to keep things consistent
+        self.assertEqual(unified_timestamp('2023-03-11T06:48:34.008'), 1678517314)

    def test_determine_ext(self):
        self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -27,7 +27,12 @@
 from .compat import functools, urllib  # isort: split
 from .compat import compat_os_name, urllib_req_to_req
 from .cookies import LenientSimpleCookie, load_cookies
-from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
+from .downloader import (
+    DashSegmentsFD,
+    FFmpegFD,
+    get_suitable_downloader,
+    shorten_protocol_name,
+)
 from .downloader.rtmp import rtmpdump_version
 from .extractor import gen_extractor_classes, get_info_extractor
 from .extractor.common import UnsupportedURLIE
@ -3353,7 +3358,7 @@ def existing_video_file(*filepaths):
                fd, success = None, True
                if info_dict.get('protocol') or info_dict.get('url'):
                    fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
-                    if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and (
+                    if fd not in [FFmpegFD, DashSegmentsFD] and 'no-direct-merge' not in self.params['compat_opts'] and (
                            info_dict.get('section_start') or info_dict.get('section_end')):
                        msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
                               else 'You have requested downloading the video partially, but ffmpeg is not installed')
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@ -12,6 +12,7 @@
 import optparse
 import os
 import re
+import time
 import traceback

 from .compat import compat_os_name, compat_shlex_quote
@ -331,12 +332,13 @@ def parse_chapters(name, value, advanced=False):
            (?P<end_sign>-?)(?P<end>[^-]+)
        )?'''

+        current_time = time.time()
        chapters, ranges, from_url = [], [], False
        for regex in value or []:
            if advanced and regex == '*from-url':
                from_url = True
                continue
-            elif not regex.startswith('*'):
+            elif not regex.startswith('*') and not regex.startswith('#'):
                try:
                    chapters.append(re.compile(regex))
                except re.error as err:
@ -353,11 +355,16 @@ def parse_chapters(name, value, advanced=False):
                    err = 'Must be of the form "*start-end"'
                elif not advanced and any(signs):
                    err = 'Negative timestamps are not allowed'
-                else:
+                elif regex.startswith('*'):
                    dur[0] *= -1 if signs[0] else 1
                    dur[1] *= -1 if signs[1] else 1
                    if dur[1] == float('-inf'):
                        err = '"-inf" is not a valid end'
+                elif regex.startswith('#'):
+                    dur[0] = dur[0] * (-1 if signs[0] else 1) + current_time
+                    dur[1] = dur[1] * (-1 if signs[1] else 1) + current_time
+                    if dur[1] == float('-inf'):
+                        err = '"-inf" is not a valid end'
                if err:
                    raise ValueError(f'invalid {name} time range "{regex}". {err}')
                ranges.append(dur)
--- a/yt_dlp/cookies.py
+++ b/yt_dlp/cookies.py
@ -46,7 +46,7 @@
 from .utils._utils import _YDLLogger
 from .utils.networking import normalize_url

-CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
+CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'}
 SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}


@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name):
            'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'),
            'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'),
            'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'),
+            'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'),
        }[browser_name]

    elif sys.platform == 'darwin':
@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name):
            'edge': os.path.join(appdata, 'Microsoft Edge'),
            'opera': os.path.join(appdata, 'com.operasoftware.Opera'),
            'vivaldi': os.path.join(appdata, 'Vivaldi'),
+            'whale': os.path.join(appdata, 'Naver/Whale'),
        }[browser_name]

    else:
@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name):
            'edge': os.path.join(config, 'microsoft-edge'),
            'opera': os.path.join(config, 'opera'),
            'vivaldi': os.path.join(config, 'vivaldi'),
+            'whale': os.path.join(config, 'naver-whale'),
        }[browser_name]

    # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE:
@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name):
        'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium',
        'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium',
        'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome',
+        'whale': 'Whale',
    }[browser_name]

    browsers_without_profiles = {'opera'}
--- a/yt_dlp/downloader/dash.py
+++ b/yt_dlp/downloader/dash.py
@ -36,6 +36,8 @@ def real_download(self, filename, info_dict):
                'filename': fmt.get('filepath') or filename,
                'live': 'is_from_start' if fmt.get('is_from_start') else fmt.get('is_live'),
                'total_frags': fragment_count,
+                'section_start': info_dict.get('section_start'),
+                'section_end': info_dict.get('section_end'),
            }

            if real_downloader:
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
        if urlh is False:
            assert not fatal
            return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
+                                             encoding=encoding, data=data)
        return (content, urlh)

    @staticmethod
@ -1005,8 +1006,10 @@ def __check_blocked(self, content):
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

-    def _request_dump_filename(self, url, video_id):
-        basen = f'{video_id}_{url}'
+    def _request_dump_filename(self, url, video_id, data=None):
+        if data is not None:
+            data = hashlib.md5(data).hexdigest()
+        basen = join_nonempty(video_id, data, url, delim='_')
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@ -1028,16 +1031,18 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
+                              prefix=None, encoding=None, data=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
+        url_or_request = self._create_request(url_or_request, data)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
-            filename = self._request_dump_filename(urlh.url, video_id)
+            filename = self._request_dump_filename(urlh.url, video_id, url_or_request.data)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)
@ -1098,7 +1103,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
                             impersonate=None, require_impersonation=False):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
-                filename = self._request_dump_filename(url_or_request.url, video_id)
+                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
@ -2695,7 +2700,7 @@ def extract_common(source):
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
-                                't': int(s.get('t', 0)),
+                                't': int_or_none(s.get('t')),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
@ -2737,8 +2742,14 @@ def extract_Initialization(source):
            return ms_info

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
+        availability_start_time = unified_timestamp(
+            mpd_doc.get('availabilityStartTime'), with_milliseconds=True) or 0
        stream_numbers = collections.defaultdict(int)
        for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
+            # segmentIngestTime is completely out of spec, but YouTube livestreams use it
+            segment_ingest_time = period.get('{http://youtube.com/yt/2012/10/10}segmentIngestTime')
+            if segment_ingest_time:
+                availability_start_time = unified_timestamp(segment_ingest_time, with_milliseconds=True)
            period_entry = {
                'id': period.get('id', f'period-{period_idx}'),
                'formats': [],
@ -2917,13 +2928,17 @@ def add_segment_url():
                                    'Bandwidth': bandwidth,
                                    'Number': segment_number,
                                }
+                                duration = float_or_none(segment_d, representation_ms_info['timescale'])
+                                start = float_or_none(segment_time, representation_ms_info['timescale'])
                                representation_ms_info['fragments'].append({
                                    media_location_key: segment_url,
-                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+                                    'duration': duration,
+                                    'start': availability_start_time + start,
+                                    'end': availability_start_time + start + duration,
                                })

                            for num, s in enumerate(representation_ms_info['s']):
-                                segment_time = s.get('t') or segment_time
+                                segment_time = s['t'] if s.get('t') is not None else segment_time
                                segment_d = s['d']
                                add_segment_url()
                                segment_number += 1
@ -2939,6 +2954,7 @@ def add_segment_url():
                        fragments = []
                        segment_index = 0
                        timescale = representation_ms_info['timescale']
+                        start = 0
                        for s in representation_ms_info['s']:
                            duration = float_or_none(s['d'], timescale)
                            for r in range(s.get('r', 0) + 1):
@ -2946,8 +2962,11 @@ def add_segment_url():
                                fragments.append({
                                    location_key(segment_uri): segment_uri,
                                    'duration': duration,
+                                    'start': availability_start_time + start,
+                                    'end': availability_start_time + start + duration,
                                })
                                segment_index += 1
+                                start += duration
                        representation_ms_info['fragments'] = fragments
                    elif 'segment_urls' in representation_ms_info:
                        # Segment URLs with no SegmentTimeline
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -2353,6 +2353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'format': '17',  # 3gp format available on android
                'extractor_args': {'youtube': {'player_client': ['android']}},
            },
+            'skip': 'android client broken',
        },
        {
            # Skip download of additional client configs (remix client config in this case)
@ -2730,7 +2731,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'heatmap': 'count:100',
            },
            'params': {
-                'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}},
+                'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}},
            },
        },
    ]
@ -2801,17 +2802,17 @@ def refetch_manifest(format_id, delay):
            microformats = traverse_obj(
                prs, (..., 'microformat', 'playerMicroformatRenderer'),
                expected_type=dict)
-            _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
-            is_live = live_status == 'is_live'
-            start_time = time.time()
+            with lock:
+                _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
+                is_live = live_status == 'is_live'
+                start_time = time.time()

        def mpd_feed(format_id, delay):
            """
            @returns (manifest_url, manifest_stream_number, is_live) or None
            """
            for retry in self.RetryManager(fatal=False):
-                with lock:
-                    refetch_manifest(format_id, delay)
+                refetch_manifest(format_id, delay)

                f = next((f for f in formats if f['format_id'] == format_id), None)
                if not f:
@ -2842,6 +2843,11 @@ def _live_dash_fragments(self, video_id, format_id, live_start_time, mpd_feed, m
        begin_index = 0
        download_start_time = ctx.get('start') or time.time()

+        section_start = ctx.get('section_start') or 0
+        section_end = ctx.get('section_end') or math.inf
+
+        self.write_debug(f'Selected section: {section_start} -> {section_end}')
+
        lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION
        if lack_early_segments:
            self.report_warning(bug_reports_message(
@ -2862,9 +2868,10 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
                                               or (mpd_url, stream_number, False))
            if not refresh_sequence:
                if expire_fast and not is_live:
-                    return False, last_seq
+                    return False
                elif old_mpd_url == mpd_url:
-                    return True, last_seq
+                    return True
+
            if manifestless_orig_fmt:
                fmt_info = manifestless_orig_fmt
            else:
@ -2875,14 +2882,13 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
                    fmts = None
                if not fmts:
                    no_fragment_score += 2
-                    return False, last_seq
+                    return False
                fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number)
            fragments = fmt_info['fragments']
            fragment_base_url = fmt_info['fragment_base_url']
            assert fragment_base_url

-            _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
-            return True, _last_seq
+            return True

        self.write_debug(f'[{video_id}] Generating fragments for format {format_id}')
        while is_live:
@ -2902,11 +2908,19 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
                    last_segment_url = None
                    continue
            else:
-                should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15)
+                should_continue = _extract_sequence_from_mpd(True, no_fragment_score > 15)
                no_fragment_score += 2
                if not should_continue:
                    continue

+            last_fragment = fragments[-1]
+            last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
+
+            known_fragment = next(
+                (fragment for fragment in fragments if f'sq/{known_idx}' in fragment['path']), None)
+            if known_fragment and known_fragment['end'] > section_end:
+                break
+
            if known_idx > last_seq:
                last_segment_url = None
                continue
@ -2916,20 +2930,36 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
            if begin_index < 0 and known_idx < 0:
                # skip from the start when it's negative value
                known_idx = last_seq + begin_index
+
            if lack_early_segments:
-                known_idx = max(known_idx, last_seq - int(MAX_DURATION // fragments[-1]['duration']))
+                known_idx = max(known_idx, last_seq - int(MAX_DURATION // last_fragment['duration']))
+
+            fragment_count = last_seq - known_idx if section_end == math.inf else int(
+                (section_end - section_start) // last_fragment['duration'])
+
            try:
                for idx in range(known_idx, last_seq):
                    # do not update sequence here or you'll get skipped some part of it
-                    should_continue, _ = _extract_sequence_from_mpd(False, False)
+                    should_continue = _extract_sequence_from_mpd(False, False)
                    if not should_continue:
                        known_idx = idx - 1
                        raise ExtractorError('breaking out of outer loop')
-                    last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx)
-                    yield {
-                        'url': last_segment_url,
-                        'fragment_count': last_seq,
-                    }
+
+                    frag_duration = last_fragment['duration']
+                    frag_start = last_fragment['start'] - (last_seq - idx) * frag_duration
+                    frag_end = frag_start + frag_duration
+
+                    if frag_start >= section_start and frag_end <= section_end:
+                        last_segment_url = urljoin(fragment_base_url, f'sq/{idx}')
+
+                        yield {
+                            'url': last_segment_url,
+                            'fragment_count': fragment_count,
+                            'duration': frag_duration,
+                            'start': frag_start,
+                            'end': frag_end,
+                        }
+
                if known_idx == last_seq:
                    no_fragment_score += 5
                else:
@ -3317,7 +3347,36 @@ def _extract_heatmap(self, data):
                'value': ('intensityScoreNormalized', {float_or_none}),
            })) or None

-    def _extract_comment(self, comment_renderer, parent=None):
+    def _extract_comment(self, entities, parent=None):
+        comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict}))
+        if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))):
+            return
+
+        toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict}))
+        time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or ''
+
+        return {
+            'id': comment_id,
+            'parent': parent or 'root',
+            **traverse_obj(comment_entity_payload, {
+                'text': ('properties', 'content', 'content', {str}),
+                'like_count': ('toolbar', 'likeCountA11y', {parse_count}),
+                'author_id': ('author', 'channelId', {self.ucid_or_none}),
+                'author': ('author', 'displayName', {str}),
+                'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}),
+                'author_is_uploader': ('author', 'isCreator', {bool}),
+                'author_is_verified': ('author', 'isVerified', {bool}),
+                'author_url': ('author', 'channelCommand', 'innertubeCommand', (
+                    ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url')
+                ), {lambda x: urljoin('https://www.youtube.com', x)}),
+            }, get_all=False),
+            'is_favorited': (None if toolbar_entity_payload is None else
+                             toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'),
+            '_time_text': time_text,  # FIXME: non-standard, but we need a way of showing that it is an estimate.
+            'timestamp': self._parse_time_text(time_text),
+        }
+
+    def _extract_comment_old(self, comment_renderer, parent=None):
        comment_id = comment_renderer.get('commentId')
        if not comment_id:
            return
@ -3398,21 +3457,39 @@ def extract_header(contents):
                break
            return _continuation

-        def extract_thread(contents):
+        def extract_thread(contents, entity_payloads):
            if not parent:
                tracker['current_page_thread'] = 0
            for content in contents:
                if not parent and tracker['total_parent_comments'] >= max_parents:
                    yield
                comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
-                comment_renderer = get_first(
-                    (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
-                    expected_type=dict, default={})

-                comment = self._extract_comment(comment_renderer, parent)
+                # old comment format
+                if not entity_payloads:
+                    comment_renderer = get_first(
+                        (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
+                        expected_type=dict, default={})
+
+                    comment = self._extract_comment_old(comment_renderer, parent)
+
+                # new comment format
+                else:
+                    view_model = (
+                        traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict}))
+                        or traverse_obj(content, ('commentViewModel', {dict})))
+                    comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str}))
+                    if not comment_keys:
+                        continue
+                    entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys)
+                    comment = self._extract_comment(entities, parent)
+                    if comment:
+                        comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None
+
                if not comment:
                    continue
                comment_id = comment['id']
+
                if comment.get('is_pinned'):
                    tracker['pinned_comment_ids'].add(comment_id)
                # Sometimes YouTube may break and give us infinite looping comments.
@ -3505,7 +3582,7 @@ def extract_thread(contents):
            check_get_keys = None
            if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
                check_get_keys = [[*continuation_items_path, ..., (
-                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
+                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
            try:
                response = self._extract_response(
                    item_id=None, query=continuation,
@ -3529,6 +3606,7 @@ def extract_thread(contents):
                raise
            is_forced_continuation = False
            continuation = None
+            mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict}))
            for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
                if is_first_continuation:
                    continuation = extract_header(continuation_items)
@ -3537,7 +3615,7 @@ def extract_thread(contents):
                        break
                    continue

-                for entry in extract_thread(continuation_items):
+                for entry in extract_thread(continuation_items, mutations):
                    if not entry:
                        return
                    yield entry
@ -3614,8 +3692,6 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
        yt_query = {
            'videoId': video_id,
        }
-        if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'):
-            yt_query['params'] = 'CgIIAQ=='

        pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
        if pp_arg:
@ -3631,19 +3707,24 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,

    def _get_requested_clients(self, url, smuggled_data):
        requested_clients = []
-        default = ['ios', 'android', 'web']
+        android_clients = []
+        default = ['ios', 'web']
        allowed_clients = sorted(
            (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'),
            key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
        for client in self._configuration_arg('player_client'):
-            if client in allowed_clients:
-                requested_clients.append(client)
-            elif client == 'default':
+            if client == 'default':
                requested_clients.extend(default)
            elif client == 'all':
                requested_clients.extend(allowed_clients)
-            else:
+            elif client not in allowed_clients:
                self.report_warning(f'Skipping unsupported client {client}')
+            elif client.startswith('android'):
+                android_clients.append(client)
+            else:
+                requested_clients.append(client)
+        # Force deprioritization of broken Android clients for format de-duplication
+        requested_clients.extend(android_clients)
        if not requested_clients:
            requested_clients = default

@ -3862,6 +3943,14 @@ def build_fragments(f):
                    f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)

            client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
+            # Android client formats are broken due to integrity check enforcement
+            # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554
+            is_broken = client_name and client_name.startswith(short_client_name('android'))
+            if is_broken:
+                self.report_warning(
+                    f'{video_id}: Android client formats are broken and may yield HTTP Error 403. '
+                    'They will be deprioritized', only_once=True)
+
            name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
            fps = int_or_none(fmt.get('fps')) or 0
            dct = {
@ -3874,7 +3963,7 @@ def build_fragments(f):
                    name, fmt.get('isDrc') and 'DRC',
                    try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                    try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
-                    throttled and 'THROTTLED', is_damaged and 'DAMAGED',
+                    throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN',
                    (self.get_param('verbose') or all_formats) and client_name,
                    delim=', '),
                # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
@ -3892,8 +3981,8 @@ def build_fragments(f):
                'language': join_nonempty(audio_track.get('id', '').split('.')[0],
                                          'desc' if language_preference < -1 else '') or None,
                'language_preference': language_preference,
-                # Strictly de-prioritize damaged and 3gp formats
-                'preference': -10 if is_damaged else -2 if itag == '17' else None,
+                # Strictly de-prioritize broken, damaged and 3gp formats
+                'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None,
            }
            mime_mobj = re.match(
                r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
@ -3918,6 +4007,9 @@ def build_fragments(f):
                dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE}
                yield dct

+        if live_status == 'is_live' and self.get_param('download_ranges') and not self.get_param('live_from_start'):
+            self.report_warning('For YT livestreams, --download-sections is only supported with --live-from-start')
+
        needs_live_processing = self._needs_live_processing(live_status, duration)
        skip_bad_formats = 'incomplete' not in format_types
        if self._configuration_arg('include_incomplete_formats'):
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@ -419,7 +419,14 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
    general.add_option(
        '--live-from-start',
        action='store_true', dest='live_from_start',
-        help='Download livestreams from the start. Currently only supported for YouTube (Experimental)')
+        help=('Download livestreams from the start. Currently only supported for YouTube (Experimental). '
+              'Time ranges can be specified using --download-sections to download only a part of the stream. '
+              'Negative values are allowed for specifying a relative previous time, using the # syntax '
+              'e.g. --download-sections "#-24hours - 0" (download last 24 hours), '
+              'e.g. --download-sections "#-1h - 30m" (download from 1 hour ago until the next 30 minutes), '
+              'e.g. --download-sections "#-3days - -2days" (download from 3 days ago until 2 days ago). '
+              'It is also possible to specify an exact unix timestamp range, using the * syntax, '
+              'e.g. --download-sections "*1672531200 - 1672549200" (download between those two timestamps)'))
    general.add_option(
        '--no-live-from-start',
        action='store_false', dest='live_from_start',
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@ -1209,7 +1209,7 @@ def unified_strdate(date_str, day_first=True):
        return str(upload_date)


-def unified_timestamp(date_str, day_first=True):
+def unified_timestamp(date_str, day_first=True, with_milliseconds=False):
    if not isinstance(date_str, str):
        return None

@ -1235,7 +1235,7 @@ def unified_timestamp(date_str, day_first=True):
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
-            return calendar.timegm(dt_.timetuple())
+            return calendar.timegm(dt_.timetuple()) + (dt_.microsecond / 1e6 if with_milliseconds else 0)

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
@ -2035,16 +2035,19 @@ def parse_duration(s):

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'''(?x)
+            (?P<sign>[+-])?
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
-        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
+        sign, days, hours, mins, secs, ms = m.group('sign', 'days', 'hours', 'mins', 'secs', 'ms')
    else:
        m = re.match(
-            r'''(?ix)(?:P?
+            r'''(?ix)(?:
+                (?P<sign>[+-])?
+                P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
@ -2068,17 +2071,19 @@ def parse_duration(s):
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
-            days, hours, mins, secs, ms = m.groups()
+            sign, days, hours, mins, secs, ms = m.groups()
        else:
-            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
+            m = re.match(r'(?i)(?P<sign>[+-])?(?:(?P<days>[0-9.]+)\s*(?:days?)|(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
-                hours, mins = m.groups()
+                sign, days, hours, mins = m.groups()
            else:
                return None

+    sign = -1 if sign == '-' else 1
+
    if ms:
        ms = ms.replace(':', '.')
-    return sum(float(part or 0) * mult for part, mult in (
+    return sign * sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
Author	SHA1	Message	Date
Elyse	e1e38a8ec4	Merge `172dfbeaed` into `12d8ea8246`	2024-05-17 21:58:17 +02:00
coletdjnz	12d8ea8246	[ie/youtube] Remove `android` from default clients (#9553 ) Closes #9554 Authored by: coletdjnz, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>	2024-05-17 16:03:02 +00:00
Justin Keogh	8e15177b41	[ie/youtube] Fix comments extraction (#9775 ) Closes #9358 Authored by: jakeogh, minamotorin, shoxie007, bbilly1 Co-authored-by: minamotorin <76122224+minamotorin@users.noreply.github.com> Co-authored-by: shoxie007 <74592022+shoxie007@users.noreply.github.com> Co-authored-by: Simon <35427372+bbilly1@users.noreply.github.com>	2024-05-17 14:37:30 +00:00
Roeniss Moon	dd9ad97b1f	[cookies] Add `--cookies-from-browser` support for Whale (#9649 ) Closes #9307 Authored by: roeniss	2024-05-17 14:33:12 +00:00
minamotorin	61b17437dc	[ie] Add POST data hash to `--write-pages` filenames (#9879 ) Closes #9773 Authored by: minamotorin	2024-05-17 14:28:36 +00:00
bashonly	172dfbeaed	Merge branch 'yt-dlp:master' into pr/live-sections	2024-05-10 13:52:35 -05:00
bashonly	cf96b24de6	Merge branch 'master' into yt-live-from-start-range	2024-04-16 11:01:17 -05:00
bashonly	50c943e8a0	Merge branch 'yt-dlp:master' into pr/yt-live-from-start-range	2024-03-19 15:18:22 -05:00
bashonly	6fc6349ef0	Merge branch 'master' into yt-live-from-start-range	2024-02-29 04:58:30 -06:00
bashonly	5156a16cf9	Merge branch 'master' into yt-live-from-start-range	2024-01-19 17:05:19 -06:00
Elyse	fb2b57a773	Merge remote-tracking branch 'github/yt-live-from-start-range' into yt-live-from-start-range	2023-10-08 01:01:31 -06:00
Elyse	2741b5827d	Merge remote-tracking branch 'origin' into yt-live-from-start-range	2023-10-08 00:24:29 -06:00
bashonly	bd730470f2	Cleanup	2023-07-22 13:32:10 -05:00
bashonly	194bc49c55	Merge branch 'yt-dlp:master' into pr/6498	2023-07-22 13:23:54 -05:00
bashonly	1416cee726	Update yt_dlp/options.py	2023-07-22 17:59:48 +00:00
Elyse	622c555356	Fix bug after merge	2023-06-24 14:43:50 -06:00
Elyse	99e6074c5d	Merge remote-tracking branch 'origin' into yt-live-from-start-range	2023-06-24 14:30:12 -06:00
Elyse	1f7974690e	Merge remote-tracking branch 'origin' into yt-live-from-start-range	2023-06-03 14:39:32 -06:00
Elyse	8ee942a9c8	Add warning about --download-sections without --live-from-start	2023-05-13 13:29:28 -06:00
Elyse	444e02ef3b	Merge remote-tracking branch 'origin/master' into yt-live-from-start-range	2023-05-07 00:33:18 -06:00
Elyse	4e93198ae6	Restore README.md I think this is auto-generated by some script	2023-05-06 23:29:40 -06:00
Elyse	78285eea86	Update options docs	2023-05-06 23:24:58 -06:00
Elyse	7f93eb7a28	Support for epoch timestamps	2023-05-06 23:05:38 -06:00
Elyse	128d30492b	Always compute last_seq	2023-04-18 23:17:39 -06:00
Elyse	129555b19a	Fix return values of _extract_sequence_from_mpd	2023-03-17 22:39:21 -06:00
Elyse	01f672fe27	Lock less aggressively. This gives a speed improvement of about 30%	2023-03-17 22:37:31 -06:00
Elyse	2fbe18557b	Add some documentation	2023-03-12 01:42:45 -06:00
Elyse	b131f3d1f1	Improve option documentation	2023-03-12 01:37:33 -06:00
Elyse	544836de83	Allow days in parse_duration	2023-03-12 01:37:21 -06:00
pukkandan	6cea8cbe2d	Merge remote-tracking branch 'origin/master' into pr/6498	2023-03-12 11:57:41 +05:30
Elyse	5e4699a623	Fix linter	2023-03-11 20:02:52 -06:00
Elyse	79ae58a5c4	Fix linter	2023-03-11 20:00:34 -06:00
Elyse	3faa1e33ed	Add initial documentation	2023-03-11 19:51:14 -06:00
Elyse	fbae888c65	Add debug for selected section	2023-03-11 19:51:14 -06:00
Elyse	cdac7641d6	Remove tz_aware date code	2023-03-11 19:51:14 -06:00
Elyse	a43ba2eff6	Fix unified_timestamp	2023-03-11 19:51:14 -06:00
Elyse	0ed9a73a73	Add fragment count	2023-03-11 19:51:14 -06:00
Elyse	e40132da09	Revert "[utils] Allow using local timezone for 'now' timestamps" This reverts commit `1799a6ae36`.	2023-03-11 19:51:14 -06:00
Elyse	e6e2eb00f1	Support negative durations	2023-03-11 19:51:14 -06:00
pukkandan	9fc70f3f6d	[extractor/youtube] Construct fragment list lazily Building fragment list for all formats take significant time for large videos	2023-03-11 19:51:14 -06:00
pukkandan	5ef1a928a7	[extractor/youtube] Add extractor-arg `include_duplicate_formats`	2023-03-11 19:51:14 -06:00
Lesmiscore	db62ffdafe	[extractor/youtube] Add client name to `format_note` when `-v` (#6254 ) Authored by: Lesmiscore, pukkandan	2023-03-11 19:51:14 -06:00
vampirefrog	f137666451	[extractor/rokfin] Re-construct manifest url (#6507 ) Authored by: vampirefrog	2023-03-11 19:51:14 -06:00
Daniel Vogt	e3ffdf76aa	[extractor/opencast] Fix format bug (#6512 ) Authored by: C0D3D3V	2023-03-11 19:51:14 -06:00
pukkandan	9f717b69b4	[extractor/hidive] Fix login Fixes https://github.com/yt-dlp/yt-dlp/issues/6493#issuecomment-1462906556	2023-03-11 19:51:14 -06:00
pukkandan	34d3df72e9	Support loading info.json with a list at its root	2023-03-11 19:51:14 -06:00
makeworld	96f5d29db0	[extractor/cbc:gem] Update `_VALID_URL` (#6499 ) Authored by: makeworld-the-better-one Closes #6395	2023-03-11 19:51:13 -06:00
Elyse	c222f6cbfc	[extractor/twitch] Fix `is_live` (#6500 ) Closes #6494 Authored by: elyse0	2023-03-11 19:51:13 -06:00
pukkandan	2d1655493f	[extractor/youtube] Bypass throttling for `-f17` and related cleanup Thanks @AudricV for the finding	2023-03-11 19:51:13 -06:00
pukkandan	c376b95f95	[downloader/curl] Fix progress reporting Bug in `8c53322cda` Closes #6490	2023-03-11 19:51:13 -06:00
Daniel Vogt	8df470761e	[extractor/opencast] Add ltitools to `_VALID_URL` (#6371 ) Authored by: C0D3D3V	2023-03-11 19:51:13 -06:00
D0LLYNH0	e3b08bac9c	[extractor/iq] Set more language codes (#6476 ) Authored by: D0LLYNH0	2023-03-11 19:51:13 -06:00
Elyse	932758707f	Fix linter	2023-03-09 18:51:10 -06:00
Elyse	317ba03fdf	Improve parse_chapters comments	2023-03-09 18:35:20 -06:00
Elyse	e42e25619f	Create last_segment_url only if necessary	2023-03-09 18:24:39 -06:00
Elyse	fba1c397b1	[youtube] Support --download-sections for YT Livestream from start	2023-03-09 17:32:19 -06:00
Elyse	b83d7526f2	Add fixme in modified parse_chapters function A range like '*(now-1hour)-(now-30minutes)' doesn't work	2023-03-09 17:21:02 -06:00
Elyse	fdb9aaf416	Use local timezone for download sections	2023-03-09 17:19:39 -06:00
Elyse	1799a6ae36	[utils] Allow using local timezone for 'now' timestamps	2023-03-09 17:18:44 -06:00
Elyse	367429e238	[common] Extract start and end keys for Dash fragments	2023-03-09 17:17:16 -06:00
Sophire	439be2b4a4	[utils] Add microseconds to unified_timestamp	2023-03-09 12:07:08 -06:00
Elyse	2fbd6de957	[utils] Add hackish 'now' support for --download-sections	2023-03-09 11:30:40 -06:00