From 29f400b97d00cc7bd8b1a7549417584a45e38df8 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Thu, 4 Dec 2014 02:54:25 +0100 Subject: [PATCH 1/5] [tvp] Update extractor --- youtube_dl/extractor/tvp.py | 121 +++++++++++++++++++++++++++++------- 1 file changed, 98 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index a645800057..6b95e2ed11 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,37 +1,112 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals +import re + from .common import InfoExtractor class TvpIE(InfoExtractor): IE_NAME = 'tvp.pl' - _VALID_URL = r'https?://www\.tvp\.pl/.*?wideo/(?P\d+)/(?P\d+)' + _VALID_URL = r'https?://(?Pvod|www)\.tvp\.pl/.*/(?P\d+)$' - _TEST = { - 'url': 'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238', - 'md5': '148408967a6a468953c0a75cbdaf0d7a', - 'info_dict': { - 'id': '12878238', - 'ext': 'wmv', - 'title': '31.10.2013 - Odcinek 2', - 'description': '31.10.2013 - Odcinek 2', + _TESTS = [ + { + 'url': 'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238', + 'info_dict': { + 'id': '12878238', + 'ext': 'wmv', + 'title': 'CAMPUSnews, 31.10.2013 - Odcinek 2', + 'description': '', + }, + 'skip': 'Download has to use same server IP as extraction. Therefore, a good (load-balancing) DNS resolver will make the download fail.', + }, { + 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035', + 'info_dict': { + 'id': '4278035', + 'ext': 'wmv', + 'title': 'Ogniem i mieczem, odc. 2', + 'description': 'Bohun dowiaduje się o złamaniu przez kniahinię danego mu słowa i wyrusza do Rozłogów. Helenie w ostatniej chwili udaje się uciec dzięki pomocy Zagłoby.', + }, + 'skip': 'As above', + }, { + 'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536', + 'info_dict': { + 'id': '194536', + 'ext': 'mp4', + 'title': 'Czas honoru, I seria – odc. 13', + 'description': 'WŁADEK\nCzesław prosi Marię o dostarczenie Władkowi zarazki tyfusu. Jeśli zachoruje zostanie przewieziony do szpitala skąd łatwiej będzie go odbić. Czy matka zdecyduje się zarazić syna? Karol odwiedza Wandę przyznaje się, że ją oszukiwał, ale ostrzega też, że grozi jej aresztowanie i nalega, żeby wyjechała z Warszawy. Czy dziewczyna zdecyduje się znów oddalić od ukochanego? Rozpoczyna się akcja odbicia Władka.', + }, + }, { + 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', + 'info_dict': { + 'id': '17916176', + 'ext': 'mp4', + 'title': 'rozmaitosci, TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + 'description': '', + }, + 'params': { + # m3u8 download + 'skip_download': 'true', + }, + }, { + 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', + 'info_dict': { + 'id': '17834272', + 'ext': 'mp4', + 'title': 'Na sygnale, odc. 39', + 'description': 'Ekipa Wiktora ratuje młodą matkę, która spadła ze schodów trzymając na rękach noworodka. Okazuje się, że dziewczyna jest surogatką, a biologiczni rodzice dziecka próbują zmusić ją do oddania synka…', + }, + 'params': { + # m3u8 download + 'skip_download': 'true', + }, }, - 'skip': 'Download has to use same server IP as extraction. Therefore, a good (load-balancing) DNS resolver will make the download fail.' - } + ] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - json_url = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id - params = self._download_json( - json_url, video_id, "Downloading video metadata") - video_url = params['video_url'] - - return { + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage( + 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) + title = self._og_search_title(webpage) + series = self._search_regex( + r'{name:\s*([\'"])SeriesTitle\1,\s*value:\s*\1(?P.*?)\1},', + webpage, 'series', group='series', default=None) + if series is not None and series not in title: + title = '%s, %s' % (series, title) + info_dict = { 'id': video_id, - 'title': self._og_search_title(webpage), - 'ext': 'wmv', - 'url': video_url, - 'description': self._og_search_description(webpage), + 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage, default=''), } + if mobj.group('type') == 'vod' and info_dict['description'] == '': + info_dict.update({ + 'description': self._html_search_regex( + r'(?s)', + self._download_webpage(url, video_id), 'description', group=0), + }) + + video_url = self._search_regex( + r'0:{src:([\'"])(?P.*?)\1', webpage, 'formats', group='url', default=None) + if video_url is None: + video_url = self._download_json( + 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, + video_id)['video_url'] + + ext = video_url.rsplit('.', 1)[-1] + if ext != 'ism/manifest': + if '/' in ext: + ext = 'mp4' + info_dict.update({ + 'ext': ext, + 'url': video_url, + }) + else: + m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url) + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + info_dict.update({ + 'formats': formats, + }) + return info_dict From 6ce2c6783b2c1516fe9b1b5cb88d28231db45f8c Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Thu, 4 Dec 2014 05:14:09 +0100 Subject: [PATCH 2/5] [tvp] Add extractor --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/tvp.py | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8b513ffd1d..b09ee303d4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -415,7 +415,7 @@ from .tunein import TuneInIE from .turbo import TurboIE from .tutv import TutvIE from .tvigle import TvigleIE -from .tvp import TvpIE +from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE from .twentyfourvideo import TwentyFourVideoIE from .twitch import TwitchIE diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 6b95e2ed11..2248a9fdff 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -110,3 +110,59 @@ class TvpIE(InfoExtractor): 'formats': formats, }) return info_dict + + +class TvpSeriesIE(InfoExtractor): + IE_NAME = 'tvp.pl:Series' + _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P[^/]+)/?$' + + _TESTS = [ + { + 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem', + 'info_dict': { + 'title': 'Ogniem i mieczem', + 'id': '4278026', + }, + 'playlist_count': 4, + }, { + 'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat', + 'info_dict': { + 'title': 'Boso przez świat', + 'id': '9329207', + }, + 'playlist_count': 86, + } + ] + + def _force_download_webpage(self, url, v_id, tries=0): + if tries >= 5: + raise ExtractorError( + '%s: Cannot download webpage, try again later' % v_id) + # Sometimes happen, but in my tests second try always succeeded + try: + return self._download_webpage(url, v_id) + except IncompleteRead as e: + return self._force_download_webpage(url, v_id, tries+1) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._force_download_webpage(url, display_id) + title = self._html_search_regex( + r'(?s) id=[\'"]path[\'"]>(.*?)', webpage, 'series') + title = title.split(' / ', 2)[-1] + playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id') + playlist = self._force_download_webpage( + 'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend' + 'edId=0&sort=&page=0&pageSize=1000000' % playlist_id, display_id) + videos_paths = re.findall( + '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist) + entries = [ + self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key()) + for v_path in videos_paths] + return { + '_type': 'playlist', + 'id': playlist_id, + 'display_id': display_id, + 'title': title, + 'entries': entries, + } From 225e4b9633285b66adc914a61d8f55ca125eb91d Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Thu, 4 Dec 2014 14:08:08 +0100 Subject: [PATCH 3/5] [tvp] Remove unnecessary code --- youtube_dl/extractor/tvp.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 2248a9fdff..bd7ee25a0b 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -79,15 +79,8 @@ class TvpIE(InfoExtractor): 'id': video_id, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage, default=''), + 'description': self._og_search_description(webpage), } - if mobj.group('type') == 'vod' and info_dict['description'] == '': - info_dict.update({ - 'description': self._html_search_regex( - r'(?s)', - self._download_webpage(url, video_id), 'description', group=0), - }) - video_url = self._search_regex( r'0:{src:([\'"])(?P.*?)\1', webpage, 'formats', group='url', default=None) if video_url is None: From 995ad69c54899a0cfc84fd89083f07919acdbb83 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Thu, 4 Dec 2014 14:11:27 +0100 Subject: [PATCH 4/5] [common] Add new parameters for _download_webpage --- youtube_dl/extractor/common.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e80a2dad0b..b633ea9b92 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -360,9 +360,19 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5): """ Returns the data of the page as a string """ - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + success = False + try_count = 0 + while success is False: + try: + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + success = True + except compat_http_client.IncompleteRead as e: + try_count += 1 + if try_count >= tries: + raise e + self._sleep(timeout, video_id) if res is False: return res else: From 2415951ead0c4996d7892ef859d06970c4595701 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Thu, 4 Dec 2014 14:12:09 +0100 Subject: [PATCH 5/5] [tvp] Modernize --- youtube_dl/extractor/tvp.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index bd7ee25a0b..cf21d12024 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -127,26 +127,15 @@ class TvpSeriesIE(InfoExtractor): } ] - def _force_download_webpage(self, url, v_id, tries=0): - if tries >= 5: - raise ExtractorError( - '%s: Cannot download webpage, try again later' % v_id) - # Sometimes happen, but in my tests second try always succeeded - try: - return self._download_webpage(url, v_id) - except IncompleteRead as e: - return self._force_download_webpage(url, v_id, tries+1) - def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._force_download_webpage(url, display_id) + webpage = self._download_webpage(url, display_id, tries=5) title = self._html_search_regex( - r'(?s) id=[\'"]path[\'"]>(.*?)', webpage, 'series') - title = title.split(' / ', 2)[-1] + r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)', webpage, 'series') playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id') - playlist = self._force_download_webpage( + playlist = self._download_webpage( 'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend' - 'edId=0&sort=&page=0&pageSize=1000000' % playlist_id, display_id) + 'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5) videos_paths = re.findall( '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist) entries = [