Compare commits


19 Commits

Author SHA1 Message Date
Kieran
3462aff04f
Merge 6c8ede8188 into 8e15177b41 2024-05-17 21:31:51 +05:30
Justin Keogh
8e15177b41
[ie/youtube] Fix comments extraction (#9775)
Closes #9358
Authored by: jakeogh, minamotorin, shoxie007, bbilly1

Co-authored-by: minamotorin <76122224+minamotorin@users.noreply.github.com>
Co-authored-by: shoxie007 <74592022+shoxie007@users.noreply.github.com>
Co-authored-by: Simon <35427372+bbilly1@users.noreply.github.com>
2024-05-17 14:37:30 +00:00
Roeniss Moon
dd9ad97b1f
[cookies] Add --cookies-from-browser support for Whale (#9649)
Closes #9307
Authored by: roeniss
2024-05-17 14:33:12 +00:00
minamotorin
61b17437dc
[ie] Add POST data hash to --write-pages filenames (#9879)
Closes #9773
Authored by: minamotorin
2024-05-17 14:28:36 +00:00
kylegustavo
7975ddf245
[ie/bbc] Fix and extend extraction (#9705)
Closes #9701
Authored by: kylegustavo, dirkf, pukkandan
2024-05-17 06:20:13 +00:00
Kieran Eglin
6c8ede8188
Fixed embedding filepath issue for subs and infojson 2024-04-26 16:16:56 -07:00
Kieran Eglin
3046c17822
Fixed filepath bug when embedding thumbnails 2024-04-26 15:51:37 -07:00
Kieran Eglin
dd986a4149
Linter 2024-04-26 15:31:22 -07:00
Kieran Eglin
c3fccc58cf
Updated logic for determining file extensions 2024-04-26 15:26:26 -07:00
Kieran Eglin
28d5051546
Reverted pre/post_process function signature 2024-04-26 14:17:18 -07:00
Kieran Eglin
a1ff1d4272
Reverted unrelated changes 2024-04-26 13:55:07 -07:00
Kieran Eglin
0a3c5aceb5
Removed now-unneeded thumbnail/subtitle return values 2024-04-24 11:08:21 -07:00
Kieran Eglin
9c3b227db8
Removed files_to_move logic 2024-04-24 11:02:26 -07:00
Kieran Eglin
ea2a085397
Fixed up tests and linting 2024-04-24 10:57:39 -07:00
Kieran Eglin
44bb6c2056
[WIP] got refactor of file mover basically working 2024-04-24 09:47:58 -07:00
Kieran Eglin
fe4a15ff75
First pass at test feedback 2024-04-23 11:54:38 -07:00
Kieran Eglin
5d51ddbbfc
Removed unneeded conditionals + return 2024-04-23 11:04:46 -07:00
Kieran Eglin
c9d8184fe6
Ran flake8 2024-04-23 11:00:15 -07:00
Kieran Eglin
c574be85f1
Refactored MoveFilesPP to respect non-video files 2024-04-23 10:22:36 -07:00
11 changed files with 513 additions and 228 deletions

View File

@@ -666,7 +666,7 @@ ## Filesystem Options:
The name of the browser to load cookies
from. Currently supported browsers are:
brave, chrome, chromium, edge, firefox,
opera, safari, vivaldi. Optionally, the
opera, safari, vivaldi, whale. Optionally, the
KEYRING used for decrypting Chromium cookies
on Linux, the name/path of the PROFILE to
load cookies from, and the CONTAINER name
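As a side note, the new whale value is also reachable through the Python embedding API, not just the CLI flag documented here. A minimal sketch (the cookiesfrombrowser tuple of (browser, profile, keyring, container) is yt-dlp's documented params shape; the URL is just an example):

import yt_dlp

# Equivalent of `--cookies-from-browser whale` when embedding yt-dlp
ydl_opts = {
    'cookiesfrombrowser': ('whale', None, None, None),  # (name, profile, keyring, container)
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info('https://www.youtube.com/watch?v=BaW_jenozKc', download=False)
    print(info['title'])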

View File

@@ -874,32 +874,33 @@ def test_format_note(self):
}), r'^30fps$')
def test_postprocessors(self):
filename = 'post-processor-testfile.mp4'
audiofile = filename + '.mp3'
filename = 'post-processor-testfile'
video_file = filename + '.mp4'
audio_file = filename + '.mp3'
class SimplePP(PostProcessor):
def run(self, info):
with open(audiofile, 'w') as f:
with open(audio_file, 'w') as f:
f.write('EXAMPLE')
return [info['filepath']], info
def run_pp(params, PP):
with open(filename, 'w') as f:
with open(video_file, 'w') as f:
f.write('EXAMPLE')
ydl = YoutubeDL(params)
ydl.add_post_processor(PP())
ydl.post_process(filename, {'filepath': filename})
ydl.post_process(video_file, {'filepath': video_file})
run_pp({'keepvideo': True}, SimplePP)
self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename)
self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile)
os.unlink(filename)
os.unlink(audiofile)
run_pp({'keepvideo': True, 'outtmpl': filename}, SimplePP)
self.assertTrue(os.path.exists(video_file), '%s doesn\'t exist' % video_file)
self.assertTrue(os.path.exists(audio_file), '%s doesn\'t exist' % audio_file)
os.unlink(video_file)
os.unlink(audio_file)
run_pp({'keepvideo': False}, SimplePP)
self.assertFalse(os.path.exists(filename), '%s exists' % filename)
self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile)
os.unlink(audiofile)
run_pp({'keepvideo': False, 'outtmpl': filename}, SimplePP)
self.assertFalse(os.path.exists(video_file), '%s exists' % video_file)
self.assertTrue(os.path.exists(audio_file), '%s doesn\'t exist' % audio_file)
os.unlink(audio_file)
class ModifierPP(PostProcessor):
def run(self, info):
@@ -907,9 +908,9 @@ def run(self, info):
f.write('MODIFIED')
return [], info
run_pp({'keepvideo': False}, ModifierPP)
self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename)
os.unlink(filename)
run_pp({'keepvideo': False, 'outtmpl': filename}, ModifierPP)
self.assertTrue(os.path.exists(video_file), '%s doesn\'t exist' % video_file)
os.unlink(video_file)
def test_match_filter(self):
first = {
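The renamed variables above exercise the same public hook that third-party code uses. For orientation, a minimal custom post-processor wired in the same way as SimplePP (NotifyPP is a hypothetical name; PostProcessor and add_post_processor are the real APIs this test drives):

from yt_dlp import YoutubeDL
from yt_dlp.postprocessor.common import PostProcessor

class NotifyPP(PostProcessor):
    def run(self, info):
        # Same contract as SimplePP above: return (files_to_delete, info)
        self.to_screen('Finished post-processing %s' % info.get('filepath'))
        return [], info

ydl = YoutubeDL({'keepvideo': True})
ydl.add_post_processor(NotifyPP(), when='post_process')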

View File

@@ -3219,7 +3219,6 @@ def replace_info_dict(new_info):
# info_dict['_filename'] needs to be set for backward compatibility
info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
temp_filename = self.prepare_filename(info_dict, 'temp')
files_to_move = {}
# Forced printings
self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
@@ -3247,13 +3246,11 @@ def check_max_downloads():
sub_files = self._write_subtitles(info_dict, temp_filename)
if sub_files is None:
return
files_to_move.update(dict(sub_files))
thumb_files = self._write_thumbnails(
'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
if thumb_files is None:
return
files_to_move.update(dict(thumb_files))
infofn = self.prepare_filename(info_dict, 'infojson')
_infojson_written = self._write_info_json('video', info_dict, infofn)
@@ -3327,13 +3324,12 @@ def _write_link_file(link_type):
for link_type, should_write in write_links.items()):
return
new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
new_info, _ = self.pre_process(info_dict, 'before_dl')
replace_info_dict(new_info)
if self.params.get('skip_download'):
info_dict['filepath'] = temp_filename
info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
info_dict['__files_to_move'] = files_to_move
replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
else:
@@ -3447,9 +3443,6 @@ def correct_ext(filename, ext=new_ext):
info_dict['__files_to_merge'] = downloaded
# Even if there were no downloads, it is being merged only now
info_dict['__real_download'] = True
else:
for file in downloaded:
files_to_move[file] = None
else:
# Just a single file
dl_filename = existing_video_file(full_filename, temp_filename)
@@ -3463,7 +3456,6 @@ def correct_ext(filename, ext=new_ext):
dl_filename = dl_filename or temp_filename
info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
except network_exceptions as err:
self.report_error('unable to download video data: %s' % error_to_compat_str(err))
return
@@ -3534,7 +3526,7 @@ def ffmpeg_fixup(cndn, msg, cls):
fixup()
try:
replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
replace_info_dict(self.post_process(dl_filename, info_dict))
except PostProcessingError as err:
self.report_error('Postprocessing: %s' % str(err))
return
@@ -3655,8 +3647,6 @@ def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None):
os.remove(filename)
except OSError:
self.report_warning(f'Unable to delete file {filename}')
if filename in info.get('__files_to_move', []): # NB: Delete even if None
del info['__files_to_move'][filename]
@staticmethod
def post_extract(info_dict):
@@ -3673,8 +3663,7 @@ def actual_post_extract(info_dict):
def run_pp(self, pp, infodict):
files_to_delete = []
if '__files_to_move' not in infodict:
infodict['__files_to_move'] = {}
try:
files_to_delete, infodict = pp.run(infodict)
except PostProcessingError as e:
@@ -3686,10 +3675,7 @@ def run_pp(self, pp, infodict):
if not files_to_delete:
return infodict
if self.params.get('keepvideo', False):
for f in files_to_delete:
infodict['__files_to_move'].setdefault(f, '')
else:
if not self.params.get('keepvideo', False):
self._delete_downloaded_files(
*files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)')
return infodict
@@ -3702,23 +3688,27 @@ def run_all_pps(self, key, info, *, additional_pps=None):
return info
def pre_process(self, ie_info, key='pre_process', files_to_move=None):
if files_to_move is not None:
self.report_warning('[pre_process] "files_to_move" is deprecated and may be removed in a future version')
info = dict(ie_info)
info['__files_to_move'] = files_to_move or {}
try:
info = self.run_all_pps(key, info)
except PostProcessingError as err:
msg = f'Preprocessing: {err}'
info.setdefault('__pending_error', msg)
self.report_error(msg, is_error=False)
return info, info.pop('__files_to_move', None)
return info, files_to_move
def post_process(self, filename, info, files_to_move=None):
"""Run all the postprocessors on the given file."""
if files_to_move is not None:
self.report_warning('[post_process] "files_to_move" is deprecated and may be removed in a future version')
info['filepath'] = filename
info['__files_to_move'] = files_to_move or {}
info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
del info['__files_to_move']
info.pop('__multiple_thumbnails', None)
return self.run_all_pps('after_move', info)
def _make_archive_id(self, info_dict):
@@ -4305,10 +4295,11 @@ def _write_subtitles(self, info_dict, filename):
sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
existing_sub = self.existing_file((sub_filename_final, sub_filename))
if existing_sub:
self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
sub_info['filepath'] = existing_sub
ret.append((existing_sub, sub_filename_final))
ret.append(existing_sub)
continue
self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
@@ -4319,7 +4310,7 @@ def _write_subtitles(self, info_dict, filename):
with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
subfile.write(sub_info['data'])
sub_info['filepath'] = sub_filename
ret.append((sub_filename, sub_filename_final))
ret.append(sub_filename)
continue
except OSError:
self.report_error(f'Cannot write video subtitles file {sub_filename}')
@@ -4330,7 +4321,7 @@ def _write_subtitles(self, info_dict, filename):
sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
self.dl(sub_filename, sub_copy, subtitle=True)
sub_info['filepath'] = sub_filename
ret.append((sub_filename, sub_filename_final))
ret.append(sub_filename)
except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
if self.params.get('ignoreerrors') is not True: # False or 'only_download'
@@ -4350,6 +4341,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None
self.to_screen(f'[info] There are no {label} thumbnails to download')
return ret
multiple = write_all and len(thumbnails) > 1
info_dict['__multiple_thumbnails'] = multiple
if thumb_filename_base is None:
thumb_filename_base = filename
@@ -4371,7 +4363,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None
self.to_screen('[info] %s is already present' % (
thumb_display_id if multiple else f'{label} thumbnail').capitalize())
t['filepath'] = existing_thumb
ret.append((existing_thumb, thumb_filename_final))
ret.append(existing_thumb)
else:
self.to_screen(f'[info] Downloading {thumb_display_id} ...')
try:
@@ -4379,7 +4371,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None
self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
with open(encodeFilename(thumb_filename), 'wb') as thumbf:
shutil.copyfileobj(uf, thumbf)
ret.append((thumb_filename, thumb_filename_final))
ret.append(thumb_filename)
t['filepath'] = thumb_filename
except network_exceptions as err:
if isinstance(err, HTTPError) and err.status == 404:
@@ -4389,4 +4381,5 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None
thumbnails.pop(idx)
if ret and not write_all:
break
return ret
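Taken together, these hunks drop the files_to_move bookkeeping: _write_subtitles/_write_thumbnails now return bare paths, each subtitle/thumbnail dict carries its own 'filepath', and MoveFilesAfterDownloadPP works from the info dict alone. A rough sketch of what a caller of the deprecated signatures would look like after this change (fragment-style, assuming an existing ydl, filename and info):

def run_pre_and_post(ydl, filename, info):
    # Old style (now warns): info, files_to_move = ydl.pre_process(info, 'before_dl', files_to_move)
    info, _ = ydl.pre_process(info, 'before_dl')  # second element kept for compatibility
    # Old style (now warns): ydl.post_process(filename, info, files_to_move)
    return ydl.post_process(filename, info)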

View File

@@ -46,7 +46,7 @@
from .utils._utils import _YDLLogger
from .utils.networking import normalize_url
CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'}
SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}
@@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'),
'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'),
'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'),
'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'),
}[browser_name]
elif sys.platform == 'darwin':
@@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': os.path.join(appdata, 'Microsoft Edge'),
'opera': os.path.join(appdata, 'com.operasoftware.Opera'),
'vivaldi': os.path.join(appdata, 'Vivaldi'),
'whale': os.path.join(appdata, 'Naver/Whale'),
}[browser_name]
else:
@@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': os.path.join(config, 'microsoft-edge'),
'opera': os.path.join(config, 'opera'),
'vivaldi': os.path.join(config, 'vivaldi'),
'whale': os.path.join(config, 'naver-whale'),
}[browser_name]
# Linux keyring names can be determined by snooping on dbus while opening the browser in KDE:
@@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium',
'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium',
'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome',
'whale': 'Whale',
}[browser_name]
browsers_without_profiles = {'opera'}
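Beyond the full download path, the new mapping can be exercised directly through the module's cookie extraction helper; a hedged sketch (extract_cookies_from_browser is the existing entry point, only the 'whale' argument is new here):

from yt_dlp.cookies import extract_cookies_from_browser

# Loads Naver Whale's cookie store using the per-platform
# 'Naver Whale' data directories registered above
jar = extract_cookies_from_browser('whale')
print(f'{len(jar)} cookies loaded')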

View File

@@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'url': 'http://www.bbc.com/news/world-europe-32668511',
'info_dict': {
'id': 'world-europe-32668511',
'title': 'Russia stages massive WW2 parade',
'title': 'Russia stages massive WW2 parade despite Western boycott',
'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
},
'playlist_count': 2,
@@ -623,6 +623,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460',
'title': 'BUGGER',
'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
},
'playlist_count': 18,
}, {
@@ -631,14 +632,14 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': 'p02mprgb',
'ext': 'mp4',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
'description': 'md5:2868290467291b37feda7863f7a83f54',
'title': 'Germanwings crash site aerial video',
'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
'duration': 47,
'timestamp': 1427219242,
'upload_date': '20150324',
'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
@@ -656,21 +657,24 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
},
'params': {
'skip_download': True,
}
},
'skip': 'now SIMORGH_DATA with no video',
}, {
# single video embedded with data-playable containing XML playlists (regional section)
'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
'info_dict': {
'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'id': '39275083',
'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'ext': 'mp4',
'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'timestamp': 1434713142,
'upload_date': '20150619',
'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
},
'params': {
'skip_download': True,
}
},
}, {
# single video from video playlist embedded with vxp-playlist-data JSON
'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@@ -683,22 +687,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
},
'params': {
'skip_download': True,
}
},
'skip': '404 Not Found',
}, {
# single video story with digitalData
# single video story with __PWA_PRELOADED_STATE__
'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
'info_dict': {
'id': 'p02q6gc4',
'ext': 'flv',
'title': 'Sri Lanka’s spicy secret',
'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
'timestamp': 1437674293,
'upload_date': '20150723',
'ext': 'mp4',
'title': 'Tasting the spice of life in Jaffna',
'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
'timestamp': 1646058397,
'upload_date': '20220228',
'duration': 255,
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
# single video story without digitalData
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@@ -710,12 +713,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'timestamp': 1415867444,
'upload_date': '20141113',
},
'params': {
# rtmp download
'skip_download': True,
}
'skip': 'redirects to TopGear home page',
}, {
# single video embedded with Morph
# TODO: replacement test page
'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
'info_dict': {
'id': 'p041vhd0',
@@ -726,27 +727,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'BBC Sport',
'uploader_id': 'bbc_sport',
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Georestricted to UK',
'skip': 'Video no longer in page',
}, {
# single video with playlist.sxml URL in playlist param
# single video in __INITIAL_DATA__
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
'title': 'Ronaldo to Man Utd, Arsenal to spend?',
'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
'timestamp': 1437750175,
'upload_date': '20150724',
'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
'duration': 140,
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
# article with multiple videos embedded with playlist.sxml in playlist param
# article with multiple videos embedded with Morph.setPayload
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': '34475836',
@@ -754,6 +750,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
},
'playlist_count': 3,
}, {
# Testing noplaylist
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': 'p034ppnv',
'ext': 'mp4',
'title': 'All you need to know about Jurgen Klopp',
'timestamp': 1444335081,
'upload_date': '20151008',
'duration': 122.0,
'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
},
'params': {
'noplaylist': True,
},
}, {
# school report article with single video
'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@ -762,6 +773,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'title': 'School which breaks down barriers in Jerusalem',
},
'playlist_count': 1,
'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
}, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
@@ -778,18 +790,33 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1437785037,
'upload_date': '20150725',
'duration': 105,
},
}, {
# video with window.__INITIAL_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/av/world-europe-59468682',
'info_dict': {
'id': 'p0b71qth',
'id': 'p0b779gc',
'ext': 'mp4',
'title': 'Why France is making this woman a national hero',
'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1638230731,
'upload_date': '20211130',
'timestamp': 1638215626,
'upload_date': '20211129',
'duration': 125,
},
}, {
# video with script id __NEXT_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/uk-68546268',
'info_dict': {
'id': 'p0hj0lq7',
'ext': 'mp4',
'title': 'Nasser Hospital doctor describes his treatment by IDF',
'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1710188248,
'upload_date': '20240311',
'duration': 104,
},
}, {
# single video article embedded with data-media-vpid
@@ -817,6 +844,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'Radio 3',
'uploader_id': 'bbc_radio_three',
},
'skip': '404 Not Found',
}, {
'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
'info_dict': {
@@ -824,6 +852,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'ext': 'mp4',
'title': 'md5:2fabf12a726603193a2879a055f72514',
'description': 'Learn English words and phrases from this story',
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
},
'add_ie': [BBCCoUkIE.ie_key()],
}, {
@@ -832,28 +861,30 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': 'p07c6sb9',
'ext': 'mp4',
'title': 'How positive thinking is harming your happiness',
'alt_title': 'The downsides of positive thinking',
'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
'title': 'The downsides of positive thinking',
'description': 'The downsides of positive thinking',
'duration': 235,
'thumbnail': r're:https?://.+/p07c9dsr.jpg',
'upload_date': '20190604',
'categories': ['Psychology'],
'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
'upload_date': '20220223',
'timestamp': 1645632746,
},
}, {
# BBC Sounds
'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
'info_dict': {
'id': 'm001q789',
'id': 'p0hrw4nr',
'ext': 'mp4',
'title': 'The Night Tracks Mix - Music for the darkling hour',
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
'chapters': 'count:8',
'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
'uploader': 'Radio 3',
'duration': 1800,
'uploader_id': 'bbc_radio_three',
},
'title': 'Are our coastlines being washed away?',
'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
'timestamp': 1713556800,
'upload_date': '20240419',
'duration': 1588,
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
'uploader': 'World Service',
'uploader_id': 'bbc_world_service',
'series': 'CrowdScience',
'chapters': [],
}
}, { # onion routes
'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
'only_matching': True,
@@ -1008,8 +1039,7 @@ def _real_extract(self, url):
webpage, 'group id', default=None)
if group_id:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % group_id,
ie=BBCCoUkIE.ie_key())
f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
@@ -1069,83 +1099,133 @@ def _real_extract(self, url):
}
# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
# There are several setPayload calls may be present but the video
# seems to be always related to the first one
morph_payload = self._parse_json(
self._search_regex(
r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
webpage, 'morph payload', default='{}'),
playlist_id, fatal=False)
# Several setPayload calls may be present but the video(s)
# should be in one that mentions leadMedia or videoData
morph_payload = self._search_json(
r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
default={})
if morph_payload:
components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
for component in components:
if not isinstance(component, dict):
continue
lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
if not lead_media:
continue
identifiers = lead_media.get('identifiers')
if not identifiers or not isinstance(identifiers, dict):
continue
programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
for lead_media in traverse_obj(morph_payload, (
'body', 'components', ..., 'props', 'leadMedia', {dict})):
programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
if not programme_id:
continue
title = lead_media.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id)
description = lead_media.get('summary')
uploader = lead_media.get('masterBrand')
uploader_id = lead_media.get('mid')
duration = None
duration_d = lead_media.get('duration')
if isinstance(duration_d, dict):
duration = parse_duration(dict_get(
duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
return {
'id': programme_id,
'title': title,
'description': description,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'title': lead_media.get('title') or self._og_search_title(webpage),
**traverse_obj(lead_media, {
'description': ('summary', {str}),
'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
'uploader': ('masterBrand', {str}),
'uploader_id': ('mid', {str}),
}),
'formats': formats,
'subtitles': subtitles,
}
body = self._parse_json(traverse_obj(morph_payload, (
'body', 'content', 'article', 'body')), playlist_id, fatal=False)
for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
if video_data.get('vpid'):
video_id = video_data['vpid']
formats, subtitles = self._download_media_selector(video_id)
entry = {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
}
else:
video_id = video_data['pid']
entry = self.url_result(
f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
video_id, url_transparent=True)
entry.update({
'timestamp': traverse_obj(morph_payload, (
'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601})
),
**traverse_obj(video_data, {
'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
'title': (('title', 'caption'), {str}, any),
'duration': ('duration', {parse_duration}),
}),
})
if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
return entry
entries.append(entry)
if entries:
playlist_title = traverse_obj(morph_payload, (
'body', 'content', 'article', 'headline', {str})) or playlist_title
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
preload_state = self._parse_json(self._search_regex(
r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
'preload state', default='{}'), playlist_id, fatal=False)
if preload_state:
current_programme = preload_state.get('programmes', {}).get('current') or {}
programme_id = current_programme.get('id')
if current_programme and programme_id and current_programme.get('type') == 'playable_item':
title = current_programme.get('titles', {}).get('tertiary') or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
synopses = current_programme.get('synopses') or {}
network = current_programme.get('network') or {}
duration = int_or_none(
current_programme.get('duration', {}).get('value'))
thumbnail = None
image_url = current_programme.get('image_url')
if image_url:
thumbnail = image_url.replace('{recipe}', 'raw')
# various PRELOADED_STATE JSON
preload_state = self._search_json(
r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
'preload state', playlist_id, transform_source=js_to_json, default={})
# PRELOADED_STATE with current programmme
current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
programme_id = traverse_obj(current_programme, ('id', {str}))
if programme_id and current_programme.get('type') == 'playable_item':
title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
return {
'id': programme_id,
'title': title,
'formats': formats,
**traverse_obj(current_programme, {
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
'duration': ('duration', 'value', {int_or_none}),
'uploader': ('network', 'short_title', {str}),
'uploader_id': ('network', 'id', {str}),
'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
'series': ('titles', 'primary', {str}),
}),
'subtitles': subtitles,
'chapters': traverse_obj(preload_state, (
'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
})
),
}
# PWA_PRELOADED_STATE with article video asset
asset_id = traverse_obj(preload_state, (
'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
'assetVideo', 0, {str}, any))
if asset_id:
video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
if video_id:
article = traverse_obj(preload_state, (
'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
def image_url(image_id):
return traverse_obj(preload_state, (
'entities', 'images', image_id, 'url',
{lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
formats, subtitles = self._download_media_selector(video_id)
return {
'id': programme_id,
'title': title,
'description': dict_get(synopses, ('long', 'medium', 'short')),
'thumbnail': thumbnail,
'duration': duration,
'uploader': network.get('short_title'),
'uploader_id': network.get('id'),
'id': video_id,
**traverse_obj(preload_state, ('entities', 'videos', asset_id, {
'title': ('title', {str}),
'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
'thumbnail': (0, {image_url}),
'duration': ('duration', {int_or_none}),
})),
'formats': formats,
'subtitles': subtitles,
'chapters': traverse_obj(preload_state, (
'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
})) or None,
'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
}
else:
return self.url_result(
f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
asset_id, playlist_title, display_id=playlist_id,
description=playlist_description)
bbc3_config = self._parse_json(
self._search_regex(
@@ -1191,6 +1271,28 @@ def _real_extract(self, url):
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def parse_model(model):
"""Extract single video from model structure"""
item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
if not item_id:
return
formats, subtitles = self._download_media_selector(item_id)
return {
'id': item_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
'duration': ('versions', 0, 'duration', {int}),
'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
})
}
def is_type(*types):
return lambda _, v: v['type'] in types
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
'quoted preload state', default=None)
@@ -1202,6 +1304,19 @@ def _real_extract(self, url):
initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data:
for video_data in traverse_obj(initial_data, (
'stores', 'article', 'articleBodyContent', is_type('video'))):
model = traverse_obj(video_data, (
'model', 'blocks', is_type('aresMedia'),
'model', 'blocks', is_type('aresMediaMetadata'),
'model', {dict}, any))
entry = parse_model(model)
if entry:
entries.append(entry)
if entries:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def parse_media(media):
if not media:
return
@@ -1234,27 +1349,90 @@ def parse_media(media):
'subtitles': subtitles,
'timestamp': item_time,
'description': strip_or_none(item_desc),
'duration': int_or_none(item.get('duration')),
})
for resp in (initial_data.get('data') or {}).values():
name = resp.get('name')
for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
name = resp['name']
if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article':
for block in (try_get(resp,
(lambda x: x['data']['blocks'],
lambda x: x['data']['content']['model']['blocks'],),
list) or []):
if block.get('type') not in ['media', 'video']:
continue
parse_media(block.get('model'))
for block in traverse_obj(resp, (
'data', (None, ('content', 'model')), 'blocks',
is_type('media', 'video'), 'model', {dict})):
parse_media(block)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
# extract from SIMORGH_DATA hydration JSON
simorgh_data = self._search_json(
r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
'simorgh data', playlist_id, default={})
if simorgh_data:
done = False
for video_data in traverse_obj(simorgh_data, (
'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
model = traverse_obj(video_data, (
'model', 'blocks', is_type('aresMedia'),
'model', 'blocks', is_type('aresMediaMetadata'),
'model', {dict}, any))
if video_data['type'] == 'video':
entry = parse_model(model)
else: # legacyMedia: no duration, subtitles
block_id, entry = traverse_obj(model, ('blockId', {str})), None
media_data = traverse_obj(simorgh_data, (
'pageData', 'promo', 'media',
{lambda x: x if x['id'] == block_id else None}))
formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
'url': ('url', {url_or_none}),
'ext': ('format', {str}),
'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
}))
if formats:
entry = {
'id': block_id,
'display_id': playlist_id,
'formats': formats,
'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
}),
}
done = True
if entry:
entries.append(entry)
if done:
break
if entries:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
re.findall(pattern, webpage))))
# US accessed article with single embedded video (e.g.
# https://www.bbc.com/news/uk-68546268)
next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
('props', 'pageProps', 'page'))
model = traverse_obj(next_data, (
..., 'contents', is_type('video'),
'model', 'blocks', is_type('media'),
'model', 'blocks', is_type('mediaMetadata'),
'model', {dict}, any))
if model and (entry := parse_model(model)):
if not entry.get('timestamp'):
entry['timestamp'] = traverse_obj(next_data, (
..., 'contents', is_type('timestamp'), 'model',
'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
entries.append(entry)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
# Multiple video article (e.g.
# http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
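Most of this rewrite swaps imperative try_get chains for declarative traverse_obj paths, with is_type as the recurring filter. A standalone toy illustration of the pattern (invented data shaped like the aresMedia blocks above; traverse_obj and its `any` path key are the real yt-dlp utilities):

from yt_dlp.utils import traverse_obj

def is_type(*types):
    return lambda _, v: v['type'] in types

# Made-up blocks list mimicking the nested aresMedia structure
blocks = [
    {'type': 'text', 'model': {}},
    {'type': 'video', 'model': {'blocks': [
        {'type': 'aresMediaMetadata', 'model': {'title': 'Example clip'}},
    ]}},
]
title = traverse_obj(blocks, (
    is_type('video'), 'model', 'blocks',
    is_type('aresMediaMetadata'), 'model', 'title', {str}, any))
assert title == 'Example clip'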

View File

@@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
if urlh is False:
assert not fatal
return False
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
encoding=encoding, data=data)
return (content, urlh)
@staticmethod
@@ -1005,8 +1006,10 @@ def __check_blocked(self, content):
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
def _request_dump_filename(self, url, video_id):
basen = f'{video_id}_{url}'
def _request_dump_filename(self, url, video_id, data=None):
if data is not None:
data = hashlib.md5(data).hexdigest()
basen = join_nonempty(video_id, data, url, delim='_')
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -1028,16 +1031,18 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
except LookupError:
return webpage_bytes.decode('utf-8', 'replace')
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read()
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
url_or_request = self._create_request(url_or_request, data)
if self.get_param('dump_intermediate_pages', False):
self.to_screen('Dumping request to ' + urlh.url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
filename = self._request_dump_filename(urlh.url, video_id)
filename = self._request_dump_filename(urlh.url, video_id, url_or_request.data)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1098,7 +1103,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
filename = self._request_dump_filename(url_or_request.url, video_id)
filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
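The practical effect of threading data through is easiest to see on the dump filename itself; a sketch approximating _request_dump_filename above (trim handling simplified, helper name hypothetical):

import hashlib

from yt_dlp.utils import join_nonempty, sanitize_filename

def dump_filename(url, video_id, data=None, trim_length=240):
    # POST payloads are folded in as an md5 digest, so two --write-pages
    # dumps of the same URL with different bodies no longer collide
    if data is not None:
        data = hashlib.md5(data).hexdigest()
    basen = join_nonempty(video_id, data, url, delim='_')
    if len(basen) > trim_length:  # simplified vs. the real trim logic
        basen = basen[:trim_length - 35] + '___' + hashlib.md5(basen.encode()).hexdigest()
    return sanitize_filename(basen, restricted=True) + '.dump'

print(dump_filename('https://example.com/api', 'abc123', b'page=2'))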

View File

@@ -3317,7 +3317,36 @@ def _extract_heatmap(self, data):
'value': ('intensityScoreNormalized', {float_or_none}),
})) or None
def _extract_comment(self, comment_renderer, parent=None):
def _extract_comment(self, entities, parent=None):
comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict}))
if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))):
return
toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict}))
time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or ''
return {
'id': comment_id,
'parent': parent or 'root',
**traverse_obj(comment_entity_payload, {
'text': ('properties', 'content', 'content', {str}),
'like_count': ('toolbar', 'likeCountA11y', {parse_count}),
'author_id': ('author', 'channelId', {self.ucid_or_none}),
'author': ('author', 'displayName', {str}),
'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}),
'author_is_uploader': ('author', 'isCreator', {bool}),
'author_is_verified': ('author', 'isVerified', {bool}),
'author_url': ('author', 'channelCommand', 'innertubeCommand', (
('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url')
), {lambda x: urljoin('https://www.youtube.com', x)}),
}, get_all=False),
'is_favorited': (None if toolbar_entity_payload is None else
toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'),
'_time_text': time_text, # FIXME: non-standard, but we need a way of showing that it is an estimate.
'timestamp': self._parse_time_text(time_text),
}
def _extract_comment_old(self, comment_renderer, parent=None):
comment_id = comment_renderer.get('commentId')
if not comment_id:
return
@@ -3398,21 +3427,39 @@ def extract_header(contents):
break
return _continuation
def extract_thread(contents):
def extract_thread(contents, entity_payloads):
if not parent:
tracker['current_page_thread'] = 0
for content in contents:
if not parent and tracker['total_parent_comments'] >= max_parents:
yield
comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})
comment = self._extract_comment(comment_renderer, parent)
# old comment format
if not entity_payloads:
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})
comment = self._extract_comment_old(comment_renderer, parent)
# new comment format
else:
view_model = (
traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict}))
or traverse_obj(content, ('commentViewModel', {dict})))
comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str}))
if not comment_keys:
continue
entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys)
comment = self._extract_comment(entities, parent)
if comment:
comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None
if not comment:
continue
comment_id = comment['id']
if comment.get('is_pinned'):
tracker['pinned_comment_ids'].add(comment_id)
# Sometimes YouTube may break and give us infinite looping comments.
@@ -3505,7 +3552,7 @@ def extract_thread(contents):
check_get_keys = None
if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
check_get_keys = [[*continuation_items_path, ..., (
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
try:
response = self._extract_response(
item_id=None, query=continuation,
@@ -3529,6 +3576,7 @@ def extract_thread(contents):
raise
is_forced_continuation = False
continuation = None
mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict}))
for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
if is_first_continuation:
continuation = extract_header(continuation_items)
@@ -3537,7 +3585,7 @@ def extract_thread(contents):
break
continue
for entry in extract_thread(continuation_items):
for entry in extract_thread(continuation_items, mutations):
if not entry:
return
yield entry
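For orientation, the new-format comments live in frameworkUpdates mutations rather than commentRenderer objects. An invented, abbreviated example of the entity shapes _extract_comment now consumes (keys per the traversal above, all values made up):

entities = [
    {'entityKey': 'key1', 'payload': {'commentEntityPayload': {
        'properties': {
            'commentId': 'UgzExampleId',
            'content': {'content': 'Nice video!'},
            'publishedTime': '2 days ago',
        },
        'author': {'channelId': 'UCxxxxxxxxxxxxxxxxxxxxxx', 'displayName': '@someone'},
        'toolbar': {'likeCountA11y': '1.2K likes'},
    }}},
    {'entityKey': 'key2', 'payload': {'engagementToolbarStateEntityPayload': {
        'heartState': 'TOOLBAR_HEART_STATE_HEARTED'}}},
]
# extract_thread() selects the entities whose entityKey matches the view
# model's commentKey/toolbarStateKey and hands that pair to _extract_comment()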

View File

@@ -224,4 +224,8 @@ def run(self, info):
thumbnail_filename if converted or not self._already_have_thumbnail else None,
original_thumbnail if converted and not self._already_have_thumbnail else None,
info=info)
if not self._already_have_thumbnail:
info['thumbnails'][idx].pop('filepath', None)
return [], info

View File

@@ -662,6 +662,10 @@ def run(self, info):
self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
os.replace(temp_filename, filename)
if not self._already_have_subtitle:
for _, subtitle in subtitles.items():
subtitle.pop('filepath', None)
files_to_delete = [] if self._already_have_subtitle else sub_filenames
return files_to_delete, info
@@ -698,6 +702,7 @@ def run(self, info):
infojson_filename = info.get('infojson_filename')
options.extend(self._get_infojson_opts(info, infojson_filename))
if not infojson_filename:
info.pop('infojson_filename', None)
files_to_delete.append(info.get('infojson_filename'))
elif self._add_infojson is True:
self.to_screen('The info-json can only be attached to mkv/mka files')
@@ -1016,9 +1021,6 @@ def run(self, info):
'filepath': new_file,
}
info['__files_to_move'][new_file] = replace_extension(
info['__files_to_move'][sub['filepath']], new_ext)
return sub_filenames, info
@@ -1083,16 +1085,15 @@ def is_webp(cls, path):
return imghdr.what(path) == 'webp'
def fixup_webp(self, info, idx=-1):
thumbnail_filename = info['thumbnails'][idx]['filepath']
thumbnail = info['thumbnails'][idx]
thumbnail_filename = thumbnail['filepath']
_, thumbnail_ext = os.path.splitext(thumbnail_filename)
if thumbnail_ext:
if thumbnail_ext.lower() != '.webp' and imghdr.what(thumbnail_filename) == 'webp':
self.to_screen('Correcting thumbnail "%s" extension to webp' % thumbnail_filename)
webp_filename = replace_extension(thumbnail_filename, 'webp')
os.replace(thumbnail_filename, webp_filename)
info['thumbnails'][idx]['filepath'] = webp_filename
info['__files_to_move'][webp_filename] = replace_extension(
info['__files_to_move'].pop(thumbnail_filename), 'webp')
thumbnail['filepath'] = webp_filename
@staticmethod
def _options(target_ext):
@@ -1130,8 +1131,6 @@ def run(self, info):
continue
thumbnail_dict['filepath'] = self.convert_thumbnail(original_thumbnail, target_ext)
files_to_delete.append(original_thumbnail)
info['__files_to_move'][thumbnail_dict['filepath']] = replace_extension(
info['__files_to_move'][original_thumbnail], target_ext)
if not has_thumbnail:
self.to_screen('There aren\'t any thumbnails to convert')

View File

@@ -1,16 +1,22 @@
import os
from pathlib import Path
from .common import PostProcessor
from ..compat import shutil
from ..utils import (
PostProcessingError,
decodeFilename,
encodeFilename,
make_dir,
replace_extension
)
class MoveFilesAfterDownloadPP(PostProcessor):
# Map of the keys that contain moveable files and the 'type' of the file
# for generating the output filename
CHILD_KEYS = {
'thumbnails': 'thumbnail',
'requested_subtitles': 'subtitle'
}
def __init__(self, downloader=None, downloaded=True):
PostProcessor.__init__(self, downloader)
@@ -20,34 +26,79 @@ def __init__(self, downloader=None, downloaded=True):
def pp_key(cls):
return 'MoveFiles'
def move_file_and_write_to_info(self, info_dict, relevant_dict=None, output_file_type=None):
relevant_dict = relevant_dict or info_dict
if 'filepath' not in relevant_dict:
return
output_file_type = output_file_type or ''
current_filepath, final_filepath = self.determine_filepath(info_dict, relevant_dict, output_file_type)
move_result = self.move_file(info_dict, current_filepath, final_filepath)
if move_result:
relevant_dict['filepath'] = move_result
else:
del relevant_dict['filepath']
def determine_filepath(self, info_dict, relevant_dict, output_file_type):
current_filepath = relevant_dict['filepath']
prepared_filepath = self._downloader.prepare_filename(info_dict, output_file_type)
if (output_file_type == 'thumbnail' and info_dict['__multiple_thumbnails']) or output_file_type == 'subtitle':
desired_extension = ''.join(Path(current_filepath).suffixes[-2:])
else:
desired_extension = Path(current_filepath).suffix
return current_filepath, replace_extension(prepared_filepath, desired_extension)
def move_file(self, info_dict, current_filepath, final_filepath):
if not current_filepath or not final_filepath:
return
dl_parent_folder = os.path.split(info_dict['filepath'])[0]
finaldir = info_dict.get('__finaldir', os.path.abspath(dl_parent_folder))
if not os.path.isabs(current_filepath):
current_filepath = os.path.join(finaldir, current_filepath)
if not os.path.isabs(final_filepath):
final_filepath = os.path.join(finaldir, final_filepath)
if current_filepath == final_filepath:
return final_filepath
if not os.path.exists(current_filepath):
self.report_warning('File "%s" cannot be found' % current_filepath)
return
if os.path.exists(final_filepath):
if self.get_param('overwrites', True):
self.report_warning('Replacing existing file "%s"' % final_filepath)
os.remove(final_filepath)
else:
self.report_warning(
'Cannot move file "%s" out of temporary directory since "%s" already exists. '
% (current_filepath, final_filepath))
return
make_dir(final_filepath, PostProcessingError)
self.to_screen(f'Moving file "{current_filepath}" to "{final_filepath}"')
shutil.move(current_filepath, final_filepath) # os.rename cannot move between volumes
return final_filepath
def run(self, info):
dl_path, dl_name = os.path.split(encodeFilename(info['filepath']))
finaldir = info.get('__finaldir', dl_path)
finalpath = os.path.join(finaldir, dl_name)
if self._downloaded:
info['__files_to_move'][info['filepath']] = decodeFilename(finalpath)
# This represents the main media file (using the 'filepath' key)
self.move_file_and_write_to_info(info)
make_newfilename = lambda old: decodeFilename(os.path.join(finaldir, os.path.basename(encodeFilename(old))))
for oldfile, newfile in info['__files_to_move'].items():
if not newfile:
newfile = make_newfilename(oldfile)
if os.path.abspath(encodeFilename(oldfile)) == os.path.abspath(encodeFilename(newfile)):
for key, output_file_type in self.CHILD_KEYS.items():
if key not in info:
continue
if not os.path.exists(encodeFilename(oldfile)):
self.report_warning('File "%s" cannot be found' % oldfile)
continue
if os.path.exists(encodeFilename(newfile)):
if self.get_param('overwrites', True):
self.report_warning('Replacing existing file "%s"' % newfile)
os.remove(encodeFilename(newfile))
else:
self.report_warning(
'Cannot move file "%s" out of temporary directory since "%s" already exists. '
% (oldfile, newfile))
continue
make_dir(newfile, PostProcessingError)
self.to_screen(f'Moving file "{oldfile}" to "{newfile}"')
shutil.move(oldfile, newfile) # os.rename cannot move between volumes
info['filepath'] = finalpath
if isinstance(info[key], list) or isinstance(info[key], dict):
iterable = info[key].values() if isinstance(info[key], dict) else info[key]
for file_dict in iterable:
self.move_file_and_write_to_info(info, file_dict, output_file_type)
return [], info
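The subtle piece above is determine_filepath's two-suffix rule; a standalone sketch of how language codes survive the move (helper name hypothetical; it relies on the replace_extension fix from this changeset, shown below):

from pathlib import Path

from yt_dlp.utils import replace_extension

def final_name(prepared, current, keep_two_suffixes):
    # Subtitles (and thumbnails when multiple are written) keep their last
    # two suffixes, e.g. '.en.vtt'; everything else keeps a single suffix
    suffixes = Path(current).suffixes[-2:] if keep_two_suffixes else [Path(current).suffix]
    return replace_extension(prepared, ''.join(suffixes))

print(final_name('My Video.mkv', '/tmp/My Video.en.vtt', True))   # My Video.en.vtt
print(final_name('My Video.webp', '/tmp/My Video.webp', False))   # My Video.webp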

View File

@@ -2092,7 +2092,9 @@ def prepend_extension(filename, ext, expected_real_ext=None):
def replace_extension(filename, ext, expected_real_ext=None):
name, real_ext = os.path.splitext(filename)
return '{}.{}'.format(
ext = ext if ext.startswith('.') else '.' + ext
return '{}{}'.format(
name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
ext)
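With the leading-dot normalization, dotted and bare extensions now behave the same; a few examples of the new behaviour (plain asserts, as run against this branch):

from yt_dlp.utils import replace_extension

assert replace_extension('clip.webm', 'mkv') == 'clip.mkv'
assert replace_extension('clip.webm', '.mkv') == 'clip.mkv'
# Multi-part extensions (the MoveFiles case above) no longer gain a double dot:
assert replace_extension('clip.mkv', '.en.vtt') == 'clip.en.vtt'
# expected_real_ext still guards against replacing the wrong extension:
assert replace_extension('clip.temp', 'mkv', 'temp') == 'clip.mkv'
assert replace_extension('clip.mp4', 'mkv', 'temp') == 'clip.mp4.mkv'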