[tbs] fix extraction(fixes #13658)

2024-07-18 19:34:46 +02:00 · 2017-12-11 13:38:55 +01:00 · 2017-12-11 13:38:55 +01:00 · b6f78d76c1
commit b6f78d76c1
parent 1fa0dce2c0
2 changed files with 120 additions and 62 deletions
--- a/youtube_dl/extractor/tbs.py
+++ b/youtube_dl/extractor/tbs.py
@@ -4,58 +4,110 @@
 import re

 from .turner import TurnerBaseIE
-from ..utils import extract_attributes
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    strip_or_none,
+)


 class TBSIE(TurnerBaseIE):
-    # https://github.com/rg3/youtube-dl/issues/13658
-    _WORKING = False
-
-    _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+)'
    _TESTS = [{
-        'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html',
-        'md5': '9e61d680e2285066ade7199e6408b2ee',
+        'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster',
        'info_dict': {
-            'id': '2007318',
+            'id': '8d384cde33b89f3a43ce5329de42903ed5099887',
            'ext': 'mp4',
-            'title': 'Theatrical Trailer',
-            'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.',
+            'title': 'Monster',
+            'description': 'Get a first look at the theatrical trailer for TNT’s highly anticipated new psychological thriller The Alienist, which premieres January 22 on TNT.',
+            'timestamp': 1508175329,
+            'upload_date': '20171016',
        },
-        'skip': 'TBS videos are deleted after a while',
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
    }, {
-        'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html',
-        'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56',
-        'info_dict': {
-            'id': '1538823',
-            'ext': 'mp4',
-            'title': 'You Better Run',
-            'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.',
-        },
-        'skip': 'TBS videos are deleted after a while',
+        'url': 'http://www.tbs.com/shows/search-party/season-1/episode-1/explicit-the-mysterious-disappearance-of-the-girl-no-one-knew',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tntdrama.com/movies/star-wars-a-new-hope',
+        'only_matching': True,
    }]

    def _real_extract(self, url):
        domain, display_id = re.match(self._VALID_URL, url).groups()
        site = domain[:3]
        webpage = self._download_webpage(url, display_id)
-        video_params = extract_attributes(self._search_regex(r'(<[^>]+id="page-video"[^>]*>)', webpage, 'video params'))
-        query = None
-        clip_id = video_params.get('clipid')
-        if clip_id:
-            query = 'id=' + clip_id
-        else:
-            query = 'titleId=' + video_params['titleid']
-        return self._extract_cvp_info(
-            'http://www.%s.com/service/cvpXml?%s' % (domain, query), display_id, {
-                'default': {
-                    'media_src': 'http://ht.cdn.turner.com/%s/big' % site,
-                },
-                'secure': {
-                    'media_src': 'http://androidhls-secure.cdn.turner.com/%s/big' % site,
-                    'tokenizer_src': 'http://www.%s.com/video/processors/services/token_ipadAdobe.do' % domain,
-                },
-            }, {
+        video_data = self._parse_json(self._search_regex(
+            r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>',
+            webpage, 'drupal setting'), display_id)['turner_playlist'][0]  # first playlist entry of the page's embedded settings JSON
+
+        media_id = video_data['mediaID']
+        title = video_data['title']
+
+        streams_data = self._download_json(
+            'http://medium.ngtv.io/media/%s/tv' % media_id,
+            media_id)['media']['tv']
+        duration = None
+        chapters = []
+        formats = []
+        for supported_type in ('unprotected', 'bulkaes'):
+            stream_data = streams_data.get(supported_type, {})
+            m3u8_url = stream_data.get('secureUrl') or stream_data.get('url')
+            if not m3u8_url:
+                continue
+            if stream_data.get('playlistProtection') == 'spe':
+                m3u8_url = self._add_akamai_spe_token(
+                    'http://www.%s.com/service/token_spe' % site,  # NOTE(review): host uses the 3-letter site code, not the full domain - confirm
+                    m3u8_url, media_id, {
                        'url': url,
                        'site_name': site.upper(),
-                'auth_required': video_params.get('isAuthRequired') != 'false',
+                        'auth_required': video_data.get('authRequired') == '1',
                    })
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
+
+            duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration'))
+
+            if not chapters:
+                for chapter in stream_data.get('contentSegments', []):
+                    start_time = float_or_none(chapter.get('start'))
+                    chapter_duration = float_or_none(chapter.get('duration'))  # separate name so the video-level duration is not overwritten
+                    if start_time is None or chapter_duration is None:
+                        continue
+                    chapters.append({
+                        'start_time': start_time,
+                        'end_time': start_time + chapter_duration,
+                    })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for image_id, image in video_data.get('images', {}).items():
+            image_url = image.get('url')
+            if not image_url or image.get('type') != 'video':
+                continue
+            i = {
+                'id': image_id,
+                'url': image_url,
+            }
+            mobj = re.search(r'(\d+)x(\d+)', image_url)  # WIDTHxHEIGHT pair embedded in the image URL, when present
+            if mobj:
+                i.update({
+                    'width': int(mobj.group(1)),
+                    'height': int(mobj.group(2)),
+                })
+            thumbnails.append(i)
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')),
+            'duration': duration,
+            'timestamp': int_or_none(video_data.get('created')),
+            'season_number': int_or_none(video_data.get('season')),
+            'episode_number': int_or_none(video_data.get('episode')),
+            'chapters': chapters,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
--- a/youtube_dl/extractor/turner.py
+++ b/youtube_dl/extractor/turner.py
@@ -18,9 +18,32 @@


 class TurnerBaseIE(AdobePassIE):
+    _AKAMAI_SPE_TOKEN_CACHE = {}
+
    def _extract_timestamp(self, video_data):
        return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts'))

+    def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data):  # returns video_url with '?hdnea=<token>' appended, or unchanged when no token is obtained
+        secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*'  # URL path after the host, up to the last '/', with '*' appended
+        token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path)  # class-level dict keyed by secure path; a hit skips the tokenizer request
+        if not token:
+            query = {
+                'path': secure_path,
+                'videoId': content_id,
+            }
+            if ap_data.get('auth_required'):  # only then is 'accessToken' added, taken from _extract_mvpd_auth
+                query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name'])
+            auth = self._download_xml(
+                tokenizer_src, content_id, query=query)
+            error_msg = xpath_text(auth, 'error/msg')
+            if error_msg:
+                raise ExtractorError(error_msg, expected=True)  # an 'error/msg' in the tokenizer response is raised as an expected error
+            token = xpath_text(auth, 'token')
+            if not token:
+                return video_url  # no token in the response: hand back the URL untouched and cache nothing
+            self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token
+        return video_url + '?hdnea=' + token
+
    def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
        video_data = self._download_xml(data_src, video_id)
        video_id = video_data.attrib['id']
@@ -33,7 +56,6 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
        #         rtmp_src = splited_rtmp_src[1]
        # aifp = xpath_text(video_data, 'akamai/aifp', default='')

-        tokens = {}
        urls = []
        formats = []
        rex = re.compile(
@@ -67,26 +89,10 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
                secure_path_data = path_data.get('secure')
                if not secure_path_data:
                    continue
-                video_url = secure_path_data['media_src'] + video_url
-                secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*'
-                token = tokens.get(secure_path)
-                if not token:
-                    query = {
-                        'path': secure_path,
-                        'videoId': content_id,
-                    }
-                    if ap_data.get('auth_required'):
-                        query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], video_id, ap_data['site_name'], ap_data['site_name'])
-                    auth = self._download_xml(
-                        secure_path_data['tokenizer_src'], video_id, query=query)
-                    error_msg = xpath_text(auth, 'error/msg')
-                    if error_msg:
-                        raise ExtractorError(error_msg, expected=True)
-                    token = xpath_text(auth, 'token')
-                    if not token:
-                        continue
-                    tokens[secure_path] = token
-                video_url = video_url + '?hdnea=' + token
+                video_url = self._add_akamai_spe_token(
+                    secure_path_data['tokenizer_src'],
+                    secure_path_data['media_src'] + video_url,
+                    content_id, ap_data)
            elif not re.match('https?://', video_url):
                base_path_data = path_data.get(ext, path_data.get('default', {}))
                media_src = base_path_data.get('media_src')