[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors

2024-12-24 20:25:52 +01:00 · 2015-11-07 16:54:35 +01:00 · 2015-11-07 16:54:35 +01:00 · 3793090b1b
commit 3793090b1b
parent 5d0f84d32c
3 changed files with 105 additions and 108 deletions
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class AMPIE(InfoExtractor):
+    """Base class for extractors that read an Akamai AMP JSON feed."""
+
+    def _get_media_node(self, item, name, default=None):
+        """Return 'media-<name>' from item's media-group or item, else item[name], else default."""
+        media_name = 'media-%s' % name
+        media_group = item.get('media-group') or item
+        return media_group.get(media_name) or item.get(media_name) or item.get(name, default)
+
+    def _extract_feed_info(self, url):
+        """Download the Akamai Adaptive Media Player feed at url and return an info dict for its item."""
+        item = self._download_json(
+            url, None, 'Downloading Akamai AMP feed',
+            'Unable to download Akamai AMP feed')['channel']['item']
+
+        video_id = item['guid']
+
+        thumbnails = []
+        media_thumbnail = self._get_media_node(item, 'thumbnail') or []
+        if isinstance(media_thumbnail, dict):  # a lone dict is handled as a one-element list
+            media_thumbnail = [media_thumbnail]
+        for thumbnail_data in media_thumbnail:
+            thumbnail = thumbnail_data['@attributes']
+            thumbnails.append({
+                'url': self._proto_relative_url(thumbnail['url'], 'http:'),
+                'width': int_or_none(thumbnail.get('width')),
+                'height': int_or_none(thumbnail.get('height')),
+            })
+
+        subtitles = {}
+        media_subtitle = self._get_media_node(item, 'subTitle') or []
+        if isinstance(media_subtitle, dict):
+            media_subtitle = [media_subtitle]
+        for subtitle_data in media_subtitle:
+            subtitle = subtitle_data['@attributes']
+            lang = subtitle.get('lang') or 'en'
+            subtitles[lang] = [{'url': subtitle['href']}]
+
+        formats = []
+        # missing content yields an empty format list instead of a TypeError on iterating None
+        media_content = self._get_media_node(item, 'content') or []
+        if isinstance(media_content, dict):
+            media_content = [media_content]
+        for media_data in media_content:
+            media = media_data['@attributes']
+            media_type = media['type']
+            if media_type == 'video/f4m':
+                formats.extend(self._extract_f4m_formats(
+                    media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
+                    video_id, f4m_id='hds', fatal=False) or [])  # non-fatal: result may be empty
+            elif media_type == 'application/x-mpegURL':
+                formats.extend(self._extract_m3u8_formats(
+                    media['url'], video_id, m3u8_id='hls', fatal=False) or [])
+            else:
+                formats.append({
+                    'format_id': media_data['media-category']['@attributes']['label'],
+                    'url': media['url'],
+                    'preference': 1,
+                    'vbr': int_or_none(media.get('bitrate')),
+                    'filesize': int_or_none(media.get('fileSize')),
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._get_media_node(item, 'title'),
+            'description': self._get_media_node(item, 'description'),
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
+            'timestamp': parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')),
+            'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')) if media_content else None,
+            'formats': formats,
+        }
--- a/youtube_dl/extractor/dramafever.py
+++ b/youtube_dl/extractor/dramafever.py
@ -3,7 +3,7 @@

 import itertools

-from .common import InfoExtractor
+from .amp import AMPIE
 from ..compat import (
    compat_HTTPError,
    compat_urllib_parse,
@ -19,7 +19,7 @@
 )


-class DramaFeverBaseIE(InfoExtractor):
+class DramaFeverBaseIE(AMPIE):
    _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
    _NETRC_MACHINE = 'dramafever'

@ -80,60 +80,24 @@ class DramaFeverIE(DramaFeverBaseIE):
            'timestamp': 1404336058,
            'upload_date': '20140702',
            'duration': 343,
-        }
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url).replace('/', '.')

        try:
-            feed = self._download_json(
-                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id,
-                video_id, 'Downloading episode JSON')['channel']['item']
+            info = self._extract_feed_info('http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id)
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError):
                raise ExtractorError(
                    'Currently unavailable in your country.', expected=True)
            raise

-        media_group = feed.get('media-group', {})
-
-        formats = []
-        for media_content in media_group['media-content']:
-            src = media_content.get('@attributes', {}).get('url')
-            if not src:
-                continue
-            ext = determine_ext(src)
-            if ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    src, video_id, f4m_id='hds'))
-            elif ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    src, video_id, 'mp4', m3u8_id='hls'))
-            else:
-                formats.append({
-                    'url': src,
-                })
-        self._sort_formats(formats)
-
-        title = media_group.get('media-title')
-        description = media_group.get('media-description')
-        duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration'))
-        thumbnail = self._proto_relative_url(
-            media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url'))
-        timestamp = parse_iso8601(feed.get('pubDate'), ' ')
-
-        subtitles = {}
-        for media_subtitle in media_group.get('media-subTitle', []):
-            lang = media_subtitle.get('@attributes', {}).get('lang')
-            href = media_subtitle.get('@attributes', {}).get('href')
-            if not lang or not href:
-                continue
-            subtitles[lang] = [{
-                'ext': 'ttml',
-                'url': href,
-            }]
-
        series_id, episode_number = video_id.split('.')
        episode_info = self._download_json(
            # We only need a single episode info, so restricting page size to one episode
@ -146,21 +110,12 @@ def _real_extract(self, url):
            if value:
                subfile = value[0].get('subfile') or value[0].get('new_subfile')
                if subfile and subfile != 'http://www.dramafever.com/st/':
-                    subtitles.setdefault('English', []).append({
+                    info.setdefault('subtitles', {}).setdefault('English', []).append({  # key may be absent
                        'ext': 'srt',
                        'url': subfile,
                    })

-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'duration': duration,
-            'formats': formats,
-            'subtitles': subtitles,
-        }
+        return info


 class DramaFeverSeriesIE(DramaFeverBaseIE):
--- a/youtube_dl/extractor/foxnews.py
+++ b/youtube_dl/extractor/foxnews.py
@ -2,14 +2,14 @@

 import re

-from .common import InfoExtractor
+from .amp import AMPIE
 from ..utils import (
    parse_iso8601,
    int_or_none,
 )


-class FoxNewsIE(InfoExtractor):
+class FoxNewsIE(AMPIE):
    IE_DESC = 'Fox News and Fox Business Video'
    _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
    _TESTS = [
@ -20,10 +20,10 @@ class FoxNewsIE(InfoExtractor):
                'id': '3937480',
                'ext': 'flv',
                'title': 'Frozen in Time',
-                'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler',
+                'description': '16-year-old girl is size of toddler',
                'duration': 265,
-                'timestamp': 1304411491,
-                'upload_date': '20110503',
+                #'timestamp': 1304411491,
+                #'upload_date': '20110503',
                'thumbnail': 're:^https?://.*\.jpg$',
            },
        },
@ -34,10 +34,10 @@ class FoxNewsIE(InfoExtractor):
                'id': '3922535568001',
                'ext': 'mp4',
                'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal",
-                'description': "Congressman discusses the president's executive action",
+                'description': "Congressman discusses president's plan",
                'duration': 292,
-                'timestamp': 1417662047,
-                'upload_date': '20141204',
+                #'timestamp': 1417662047,
+                #'upload_date': '20141204',
                'thumbnail': 're:^https?://.*\.jpg$',
            },
        },
@ -56,48 +56,6 @@ def _real_extract(self, url):
        video_id = mobj.group('id')
        host = mobj.group('host')

-        video = self._download_json(
-            'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id)
-
-        item = video['channel']['item']
-        title = item['title']
-        description = item['description']
-        timestamp = parse_iso8601(item['dc-date'])
-
-        media_group = item['media-group']
-        duration = None
-        formats = []
-        for media in media_group['media-content']:
-            attributes = media['@attributes']
-            video_url = attributes['url']
-            if video_url.endswith('.f4m'):
-                formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id))
-            elif video_url.endswith('.m3u8'):
-                formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv'))
-            elif not video_url.endswith('.smil'):
-                duration = int_or_none(attributes.get('duration'))
-                formats.append({
-                    'url': video_url,
-                    'format_id': media['media-category']['@attributes']['label'],
-                    'preference': 1,
-                    'vbr': int_or_none(attributes.get('bitrate')),
-                    'filesize': int_or_none(attributes.get('fileSize'))
-                })
-        self._sort_formats(formats)
-
-        media_thumbnail = media_group['media-thumbnail']['@attributes']
-        thumbnails = [{
-            'url': media_thumbnail['url'],
-            'width': int_or_none(media_thumbnail.get('width')),
-            'height': int_or_none(media_thumbnail.get('height')),
-        }] if media_thumbnail else []
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'timestamp': timestamp,
-            'formats': formats,
-            'thumbnails': thumbnails,
-        }
+        info = self._extract_feed_info('http://%s/v/feed/video/%s.js?template=fox' % (host, video_id))
+        info['id'] = video_id
+        return info