From 00766ece0c5c7a80781a4ff677198c5fb69d9dc0 Mon Sep 17 00:00:00 2001 From: Sean Ellingham Date: Sat, 6 Jul 2024 00:02:35 +0100 Subject: [PATCH] [ie/vidyard] Add extractor (#10155) Closes #4618 Authored by: exterrestris --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/cellebrite.py | 69 +++--- yt_dlp/extractor/swearnet.py | 64 ++--- yt_dlp/extractor/vidyard.py | 426 ++++++++++++++++++++++++++++++++ 4 files changed, 470 insertions(+), 90 deletions(-) create mode 100644 yt_dlp/extractor/vidyard.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 7f6507def..34dea79ef 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2324,6 +2324,7 @@ ) from .vidlii import VidLiiIE from .vidly import VidlyIE +from .vidyard import VidyardIE from .viewlift import ( ViewLiftEmbedIE, ViewLiftIE, diff --git a/yt_dlp/extractor/cellebrite.py b/yt_dlp/extractor/cellebrite.py index e90365a8b..54367c4d5 100644 --- a/yt_dlp/extractor/cellebrite.py +++ b/yt_dlp/extractor/cellebrite.py @@ -1,63 +1,50 @@ -from .common import InfoExtractor -from ..utils import traverse_obj +from .vidyard import VidyardBaseIE, VidyardIE +from ..utils import ExtractorError, make_archive_id, url_basename -class CellebriteIE(InfoExtractor): +class CellebriteIE(VidyardBaseIE): _VALID_URL = r'https?://cellebrite\.com/(?:\w+)?/(?P[\w-]+)' _TESTS = [{ 'url': 'https://cellebrite.com/en/collect-data-from-android-devices-with-cellebrite-ufed/', 'info_dict': { - 'id': '16025876', + 'id': 'ZqmUss3dQfEMGpauambPuH', + 'display_id': '16025876', 'ext': 'mp4', - 'description': 'md5:174571cb97083fd1d457d75c684f4e2b', - 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png', 'title': 'Ask the Expert: Chat Capture - Collect Data from Android Devices in Cellebrite UFED', - 'duration': 455, - 'tags': [], + 'description': 'md5:dee48fe12bbae5c01fe6a053f7676da4', + 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png', + 'duration': 455.979, + '_old_archive_ids': ['cellebrite 16025876'], }, }, { 'url': 'https://cellebrite.com/en/how-to-lawfully-collect-the-maximum-amount-of-data-from-android-devices/', 'info_dict': { - 'id': '29018255', + 'id': 'QV1U8a2yzcxigw7VFnqKyg', + 'display_id': '29018255', 'ext': 'mp4', - 'duration': 134, - 'tags': [], - 'description': 'md5:e9a3d124c7287b0b07bad2547061cacf', + 'title': 'How to Lawfully Collect the Maximum Amount of Data From Android Devices', + 'description': 'md5:0e943a9ac14c374d5d74faed634d773c', 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2022/07/How-to-Lawfully-Collect-the-Maximum-Amount-of-Data-From-Android-Devices.png', - 'title': 'Android Extractions Explained', + 'duration': 134.315, + '_old_archive_ids': ['cellebrite 29018255'], }, }] - def _get_formats_and_subtitles(self, json_data, display_id): - formats = [{'url': url} for url in traverse_obj(json_data, ('mp4', ..., 'url')) or []] - subtitles = {} - - for url in traverse_obj(json_data, ('hls', ..., 'url')) or []: - fmt, sub = self._extract_m3u8_formats_and_subtitles( - url, display_id, ext='mp4', headers={'Referer': 'https://play.vidyard.com/'}) - formats.extend(fmt) - self._merge_subtitles(sub, target=subtitles) - - return formats, subtitles - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + slug = self._match_id(url) + webpage = self._download_webpage(url, slug) + vidyard_url = next(VidyardIE._extract_embed_urls(url, webpage), None) + if not vidyard_url: + raise ExtractorError('No Vidyard video embeds found on page') - player_uuid = self._search_regex( - r']*\bdata-uuid\s*=\s*"([^"\?]+)', webpage, 'player UUID') - json_data = self._download_json( - f'https://play.vidyard.com/player/{player_uuid}.json', display_id)['payload']['chapters'][0] + video_id = url_basename(vidyard_url) + info = self._process_video_json(self._fetch_video_json(video_id)['chapters'][0], video_id) + if info.get('display_id'): + info['_old_archive_ids'] = [make_archive_id(self, info['display_id'])] + if thumbnail := self._og_search_thumbnail(webpage, default=None): + info.setdefault('thumbnails', []).append({'url': thumbnail}) - formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], display_id) return { - 'id': str(json_data['videoId']), - 'title': json_data.get('name') or self._og_search_title(webpage), - 'formats': formats, - 'subtitles': subtitles, - 'description': json_data.get('description') or self._og_search_description(webpage), - 'duration': json_data.get('seconds'), - 'tags': json_data.get('tags'), - 'thumbnail': self._og_search_thumbnail(webpage), - 'http_headers': {'Referer': 'https://play.vidyard.com/'}, + 'description': self._og_search_description(webpage, default=None), + **info, } diff --git a/yt_dlp/extractor/swearnet.py b/yt_dlp/extractor/swearnet.py index b4835c5ad..2d6fb3eb4 100644 --- a/yt_dlp/extractor/swearnet.py +++ b/yt_dlp/extractor/swearnet.py @@ -1,55 +1,31 @@ -from .common import InfoExtractor -from ..utils import ExtractorError, int_or_none, traverse_obj +from .vidyard import VidyardBaseIE +from ..utils import ExtractorError, int_or_none, make_archive_id -class SwearnetEpisodeIE(InfoExtractor): +class SwearnetEpisodeIE(VidyardBaseIE): _VALID_URL = r'https?://www\.swearnet\.com/shows/(?P[\w-]+)/seasons/(?P\d+)/episodes/(?P\d+)' _TESTS = [{ 'url': 'https://www.swearnet.com/shows/gettin-learnt-with-ricky/seasons/1/episodes/1', 'info_dict': { - 'id': '232819', + 'id': 'wicK2EOzjOdxkUXGDIgcPw', + 'display_id': '232819', 'ext': 'mp4', 'episode_number': 1, 'episode': 'Episode 1', 'duration': 719, - 'description': 'md5:c48ef71440ce466284c07085cd7bd761', + 'description': r're:Are you drunk and high and craving a grilled cheese sandwich.+', 'season': 'Season 1', 'title': 'Episode 1 - Grilled Cheese Sammich', 'season_number': 1, - 'thumbnail': 'https://cdn.vidyard.com/thumbnails/232819/_RX04IKIq60a2V6rIRqq_Q_small.jpg', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/custom/0dd74f9b-388a-452e-b570-b407fb64435b_small.jpg', + 'tags': ['Getting Learnt with Ricky', 'drunk', 'grilled cheese', 'high'], + '_old_archive_ids': ['swearnetepisode 232819'], }, }] - def _get_formats_and_subtitle(self, video_source, video_id): - video_source = video_source or {} - formats, subtitles = [], {} - for key, value in video_source.items(): - if key == 'hls': - for video_hls in value: - fmts, subs = self._extract_m3u8_formats_and_subtitles(video_hls.get('url'), video_id) - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) - else: - formats.extend({ - 'url': video_mp4.get('url'), - 'ext': 'mp4', - } for video_mp4 in value) - - return formats, subtitles - - def _get_direct_subtitle(self, caption_json): - subs = {} - for caption in caption_json: - subs.setdefault(caption.get('language') or 'und', []).append({ - 'url': caption.get('vttUrl'), - 'name': caption.get('name'), - }) - - return subs - def _real_extract(self, url): - display_id, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num') - webpage = self._download_webpage(url, display_id) + slug, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num') + webpage = self._download_webpage(url, slug) try: external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid') @@ -58,22 +34,12 @@ def _real_extract(self, url): self.raise_login_required() raise - json_data = self._download_json( - f'https://play.vidyard.com/player/{external_id}.json', display_id)['payload']['chapters'][0] - - formats, subtitles = self._get_formats_and_subtitle(json_data['sources'], display_id) - self._merge_subtitles(self._get_direct_subtitle(json_data.get('captions')), target=subtitles) + info = self._process_video_json(self._fetch_video_json(external_id)['chapters'][0], external_id) + if info.get('display_id'): + info['_old_archive_ids'] = [make_archive_id(self, info['display_id'])] return { - 'id': str(json_data['videoId']), - 'title': json_data.get('name') or self._html_search_meta(['og:title', 'twitter:title'], webpage), - 'description': (json_data.get('description') - or self._html_search_meta(['og:description', 'twitter:description'], webpage)), - 'duration': int_or_none(json_data.get('seconds')), - 'formats': formats, - 'subtitles': subtitles, + **info, 'season_number': int_or_none(season_number), 'episode_number': int_or_none(episode_number), - 'thumbnails': [{'url': thumbnail_url} - for thumbnail_url in traverse_obj(json_data, ('thumbnailUrls', ...))], } diff --git a/yt_dlp/extractor/vidyard.py b/yt_dlp/extractor/vidyard.py new file mode 100644 index 000000000..20a54b161 --- /dev/null +++ b/yt_dlp/extractor/vidyard.py @@ -0,0 +1,426 @@ +import functools +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + float_or_none, + int_or_none, + join_nonempty, + mimetype2ext, + parse_resolution, + str_or_none, + unescapeHTML, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class VidyardBaseIE(InfoExtractor): + _HEADERS = {'Referer': 'https://play.vidyard.com/'} + + def _get_formats_and_subtitles(self, sources, video_id): + formats, subtitles = [], {} + + def add_hls_fmts_and_subs(m3u8_url): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', m3u8_id='hls', headers=self._HEADERS, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + hls_list = isinstance(sources, dict) and sources.pop('hls', None) + if master_m3u8_url := traverse_obj( + hls_list, (lambda _, v: v['profile'] == 'auto', 'url', {url_or_none}, any)): + add_hls_fmts_and_subs(master_m3u8_url) + if not formats: # These are duplicate and unnecesary requests if we got 'auto' hls fmts + for variant_m3u8_url in traverse_obj(hls_list, (..., 'url', {url_or_none})): + add_hls_fmts_and_subs(variant_m3u8_url) + + for source_type, source_list in traverse_obj(sources, ({dict.items}, ...)): + for source in traverse_obj(source_list, lambda _, v: url_or_none(v['url'])): + profile = source.get('profile') + formats.append({ + 'url': source['url'], + 'ext': mimetype2ext(source.get('mimeType'), default=None), + 'format_id': join_nonempty('http', source_type, profile), + **parse_resolution(profile), + }) + + self._remove_duplicate_formats(formats) + return formats, subtitles + + def _get_direct_subtitles(self, caption_json): + subs = {} + for caption in traverse_obj(caption_json, lambda _, v: url_or_none(v['vttUrl'])): + subs.setdefault(caption.get('language') or 'und', []).append({ + 'url': caption['vttUrl'], + 'name': caption.get('name'), + }) + + return subs + + def _fetch_video_json(self, video_id): + return self._download_json( + f'https://play.vidyard.com/player/{video_id}.json', video_id)['payload'] + + def _process_video_json(self, json_data, video_id): + formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], video_id) + self._merge_subtitles(self._get_direct_subtitles(json_data.get('captions')), target=subtitles) + + return { + **traverse_obj(json_data, { + 'id': ('facadeUuid', {str}), + 'display_id': ('videoId', {int}, {str_or_none}), + 'title': ('name', {str}), + 'description': ('description', {str}, {unescapeHTML}, {lambda x: x or None}), + 'duration': (( + ('milliseconds', {functools.partial(float_or_none, scale=1000)}), + ('seconds', {int_or_none})), any), + 'thumbnails': ('thumbnailUrls', ('small', 'normal'), {'url': {url_or_none}}), + 'tags': ('tags', ..., 'name', {str}), + }), + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': self._HEADERS, + } + + +class VidyardIE(VidyardBaseIE): + _VALID_URL = [ + r'https?://[\w-]+(?:\.hubs)?\.vidyard\.com/watch/(?P[\w-]+)', + r'https?://(?:embed|share)\.vidyard\.com/share/(?P[\w-]+)', + r'https?://play\.vidyard\.com/(?:player/)?(?P[\w-]+)', + ] + _EMBED_REGEX = [r']* src=["\'](?P(?:https?:)?//play\.vidyard\.com/[\w-]+)'] + _TESTS = [{ + 'url': 'https://vyexample03.hubs.vidyard.com/watch/oTDMPlUv--51Th455G5u7Q', + 'info_dict': { + 'id': 'oTDMPlUv--51Th455G5u7Q', + 'display_id': '50347', + 'ext': 'mp4', + 'title': 'Homepage Video', + 'description': 'Look I changed the description.', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/50347/OUPa5LTKV46849sLYngMqQ_small.jpg', + 'duration': 99, + 'tags': ['these', 'are', 'all', 'tags'], + }, + }, { + 'url': 'https://share.vidyard.com/watch/PaQzDAT1h8JqB8ivEu2j6Y?', + 'info_dict': { + 'id': 'PaQzDAT1h8JqB8ivEu2j6Y', + 'display_id': '9281024', + 'ext': 'mp4', + 'title': 'Inline Embed', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/spacer.gif', + 'duration': 41.186, + }, + }, { + 'url': 'https://embed.vidyard.com/share/oTDMPlUv--51Th455G5u7Q', + 'info_dict': { + 'id': 'oTDMPlUv--51Th455G5u7Q', + 'display_id': '50347', + 'ext': 'mp4', + 'title': 'Homepage Video', + 'description': 'Look I changed the description.', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/50347/OUPa5LTKV46849sLYngMqQ_small.jpg', + 'duration': 99, + 'tags': ['these', 'are', 'all', 'tags'], + }, + }, { + # First video from playlist below + 'url': 'https://embed.vidyard.com/share/SyStyHtYujcBHe5PkZc5DL', + 'info_dict': { + 'id': 'SyStyHtYujcBHe5PkZc5DL', + 'display_id': '41974005', + 'ext': 'mp4', + 'title': 'Prepare the Frame and Track for Palm Beach Polysatin Shutters With BiFold Track', + 'description': r're:In this video, you will learn how to prepare the frame.+', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/41974005/IJw7oCaJcF1h7WWu3OVZ8A_small.png', + 'duration': 258.666, + }, + }, { + # Playlist + 'url': 'https://thelink.hubs.vidyard.com/watch/pwu7pCYWSwAnPxs8nDoFrE', + 'info_dict': { + 'id': 'pwu7pCYWSwAnPxs8nDoFrE', + 'title': 'PLAYLIST - Palm Beach Shutters- Bi-Fold Track System Installation', + 'entries': [{ + 'id': 'SyStyHtYujcBHe5PkZc5DL', + 'display_id': '41974005', + 'ext': 'mp4', + 'title': 'Prepare the Frame and Track for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/41974005/IJw7oCaJcF1h7WWu3OVZ8A_small.png', + 'duration': 258.666, + }, { + 'id': '1Fw4B84jZTXLXWqkE71RiM', + 'display_id': '5861113', + 'ext': 'mp4', + 'title': 'Palm Beach - Bi-Fold Track System "Frame Installation"', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861113/29CJ54s5g1_aP38zkKLHew_small.jpg', + 'duration': 167.858, + }, { + 'id': 'DqP3wBvLXSpxrcqpT5kEeo', + 'display_id': '41976334', + 'ext': 'mp4', + 'title': 'Install the Track for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861090/RwG2VaTylUa6KhSTED1r1Q_small.png', + 'duration': 94.229, + }, { + 'id': 'opfybfxpzQArxqtQYB6oBU', + 'display_id': '41976364', + 'ext': 'mp4', + 'title': 'Install the Panel for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5860926/JIOaJR08dM4QgXi_iQ2zGA_small.png', + 'duration': 191.467, + }, { + 'id': 'rWrXvkbTNNaNqD6189HJya', + 'display_id': '41976382', + 'ext': 'mp4', + 'title': 'Adjust the Panels for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5860687/CwHxBv4UudAhOh43FVB4tw_small.png', + 'duration': 138.155, + }, { + 'id': 'eYPTB521MZ9TPEArSethQ5', + 'display_id': '41976409', + 'ext': 'mp4', + 'title': 'Assemble and Install the Valance for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861425/0y68qlMU4O5VKU7bJ8i_AA_small.png', + 'duration': 148.224, + }], + }, + 'playlist_count': 6, + }, { + # Non hubs.vidyard.com playlist + 'url': 'https://salesforce.vidyard.com/watch/d4vqPjs7Q5EzVEis5QT3jd', + 'info_dict': { + 'id': 'd4vqPjs7Q5EzVEis5QT3jd', + 'title': 'How To: Service Cloud: Import External Content in Lightning Knowledge', + 'entries': [{ + 'id': 'mcjDpSZir2iSttbvFkx6Rv', + 'display_id': '29479036', + 'ext': 'mp4', + 'title': 'Welcome to this Expert Coaching Series', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/ouyQi9WuwyiOupChUWNmjQ/7170d3485ba602e012df05_small.jpg', + 'duration': 38.205, + }, { + 'id': '84bPYwpg243G6xYEfJdYw9', + 'display_id': '21820704', + 'ext': 'mp4', + 'title': 'Chapter 1 - Title + Agenda', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/HFPN0ZgQq4Ow8BghGcQSow/bfaa30123c8f6601e7d7f2_small.jpg', + 'duration': 98.016, + }, { + 'id': 'nP17fMuvA66buVHUrzqjTi', + 'display_id': '21820707', + 'ext': 'mp4', + 'title': 'Chapter 2 - Import Options', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/rGRIF5nFjPI9OOA2qJ_Dbg/86a8d02bfec9a566845dd4_small.jpg', + 'duration': 199.136, + }, { + 'id': 'm54EcwXdpA5gDBH5rgCYoV', + 'display_id': '21820710', + 'ext': 'mp4', + 'title': 'Chapter 3 - Importing Article Translations', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/IVX4XR8zpSsiNIHx45kz-A/1ccbf8a29a33856d06b3ed_small.jpg', + 'duration': 184.352, + }, { + 'id': 'j4nzS42oq4hE9oRV73w3eQ', + 'display_id': '21820716', + 'ext': 'mp4', + 'title': 'Chapter 4 - Best Practices', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/BtrRrQpRDLbA4AT95YQyog/1f1e6b8e7fdc3fa95ec8d3_small.jpg', + 'duration': 296.960, + }, { + 'id': 'y28PYfW5pftvers9PXzisC', + 'display_id': '21820727', + 'ext': 'mp4', + 'title': 'Chapter 5 - Migration Steps', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/K2CdQOXDfLcrVTF60r0bdw/a09239ada28b6ffce12b1f_small.jpg', + 'duration': 620.640, + }, { + 'id': 'YWU1eQxYvhj29SjYoPw5jH', + 'display_id': '21820733', + 'ext': 'mp4', + 'title': 'Chapter 6 - Demo', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/rsmhP-cO8dAa8ilvFGCX0g/7911ef415167cd14032068_small.jpg', + 'duration': 631.456, + }, { + 'id': 'nmEvVqpwdJUgb74zKsLGxn', + 'display_id': '29479037', + 'ext': 'mp4', + 'title': 'Schedule Your Follow-Up', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/Rtwc7X4PEkF4Ae5kHi-Jvw/174ebed3f34227b1ffa1d0_small.jpg', + 'duration': 33.608, + }], + }, + 'playlist_count': 8, + }, { + # URL of iframe embed src + 'url': 'https://play.vidyard.com/iDqTwWGrd36vaLuaCY3nTs.html', + 'info_dict': { + 'id': 'iDqTwWGrd36vaLuaCY3nTs', + 'display_id': '9281009', + 'ext': 'mp4', + 'title': 'Lightbox Embed', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/spacer.gif', + 'duration': 39.035, + }, + }, { + # Player JSON URL + 'url': 'https://play.vidyard.com/player/7GAApnNNbcZZ46k6JqJQSh.json?disable_analytics=0', + 'info_dict': { + 'id': '7GAApnNNbcZZ46k6JqJQSh', + 'display_id': '820026', + 'ext': 'mp4', + 'title': 'The Art of Storytelling: How to Deliver Your Brand Story with Content & Social', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/MhbE-5sEFQu4x3fI6FkNlA/41eb5717c557cd19456910_small.jpg', + 'duration': 2153.013, + 'tags': ['Summit2017'], + }, + }, { + 'url': 'http://share.vidyard.com/share/diYeo6YR2yiGgL8odvS8Ri', + 'only_matching': True, + }, { + 'url': 'https://play.vidyard.com/FFlz3ZpxhIfKQ1fd9DAryA', + 'only_matching': True, + }, { + 'url': 'https://play.vidyard.com/qhMAu5A76GZVrFzOPgSf9A/type/standalone', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + # URL containing inline/lightbox embedded video + 'url': 'https://resources.altium.com/p/2-the-extreme-importance-of-pc-board-stack-up', + 'info_dict': { + 'id': 'GDx1oXrFWj4XHbipfoXaMn', + 'display_id': '3225198', + 'ext': 'mp4', + 'title': 'The Extreme Importance of PC Board Stack Up', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/73_Q3_hBexWX7Og1sae6cg/9998fa4faec921439e2c04_small.jpg', + 'duration': 3422.742, + }, + }, { + #