diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 1c5743604d..9d963ee46e 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -744,7 +744,10 @@ from .mdr import MDRIE from .medaltv import MedalTVIE from .mediaite import MediaiteIE from .mediaklikk import MediaKlikkIE -from .mediaset import MediasetIE +from .mediaset import ( + MediasetIE, + MediasetShowIE, +) from .mediasite import ( MediasiteIE, MediasiteCatalogIE, diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index 26e7abc493..119b39997a 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -1,13 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .theplatform import ThePlatformBaseIE from ..utils import ( ExtractorError, int_or_none, + OnDemandPagedList, parse_qs, + try_get, + urljoin, update_url_query, ) @@ -212,3 +216,81 @@ class MediasetIE(ThePlatformBaseIE): 'subtitles': subtitles, }) return info + + +class MediasetShowIE(MediasetIE): + _VALID_URL = r'''(?x) + (?: + https?:// + (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ + (?: + (?:fiction|programmi-tv|serie-tv)/(?:.+?/)? + (?:[a-z]+)_SE(?P\d{12}) + (?:,ST(?P\d{12}))? + (?:,sb(?P\d{9}))?$ + ) + ) + ''' + _TESTS = [{ + # TV Show webpage (with a single playlist) + 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556', + 'info_dict': { + 'id': '000000001556', + 'title': 'Fire Force', + }, + 'playlist_count': 1, + }, { + # TV Show webpage (with multiple playlists) + 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763', + 'info_dict': { + 'id': '000000002763', + 'title': 'Le Iene', + }, + 'playlist_count': 7, + }, { + # TV Show specific playlist (single page) + 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556,ST000000002738,sb100013107', + 'info_dict': { + 'id': '100013107', + 'title': 'Episodi', + }, + 'playlist_count': 4, + }, { + # TV Show specific playlist (with multiple pages) + 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375', + 'info_dict': { + 'id': '100013375', + 'title': 'I servizi', + }, + 'playlist_count': 53, + }] + + _BY_SUBBRAND = 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2?byCustomValue={subBrandId}{%s}&sort=:publishInfo_lastPublished|desc,tvSeasonEpisodeNumber|desc&range=%d-%d' + _PAGE_SIZE = 25 + + def _fetch_page(self, sb, page): + lower_limit = page * self._PAGE_SIZE + 1 + upper_limit = lower_limit + self._PAGE_SIZE - 1 + content = self._download_json( + self._BY_SUBBRAND % (sb, lower_limit, upper_limit), sb) + for entry in content.get('entries') or []: + yield self.url_result( + 'mediaset:' + entry['guid'], + playlist_title=entry['mediasetprogram$subBrandDescription']) + + def _real_extract(self, url): + playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb') + if not sb: + page = self._download_webpage(url, playlist_id) + entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url)) + for url in re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)] + title = (self._html_search_regex(r'(?s)]*>(.+?)', page, 'title', default=None) + or self._og_search_title(page)) + return self.playlist_result(entries, st or playlist_id, title) + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, sb), + self._PAGE_SIZE) + title = try_get(entries, lambda x: x[0]['playlist_title']) + + return self.playlist_result(entries, sb, title)