[extractor/common] Extract timestamp from Last-Modified header

[extractor/common] Add support for dl8-* media tags (closes #27283)
[extractor/common] Eliminate media tag name regex duplication
2024-07-27 18:33:31 +02:00 · 2020-12-07 01:15:30 +07:00 · 2020-12-07 01:08:22 +07:00 · 2020-12-07 00:56:29 +07:00 · 2020-12-07 00:45:16 +07:00
2 changed files with 9 additions and 5 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -2513,16 +2513,18 @@ class InfoExtractor(object):
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
+        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
+        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
-                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
+                      in re.findall(r'(?s)(<%s[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
-            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
-        for media_tag, media_type, media_content in media_tags:
+            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
+        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -2360,7 +2360,7 @@ class GenericIE(InfoExtractor):
        info_dict = {
            'id': video_id,
            'title': self._generic_title(url),
-            'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
+            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
        }

        # Check for direct link to a video
@@ -2466,7 +2466,9 @@ class GenericIE(InfoExtractor):
        # Sometimes embedded video player is hidden behind percent encoding
        # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
        # Unescaping the whole page allows to handle those cases in a generic way
-        webpage = compat_urllib_parse_unquote(webpage)
+        # FIXME: unescaping the whole page may break URLs, commenting out for now.
+        # There probably should be a second run of generic extractor on unescaped webpage.
+        # webpage = compat_urllib_parse_unquote(webpage)

        # Unescape squarespace embeds to be detected by generic extractor,
        # see https://github.com/ytdl-org/youtube-dl/issues/21294
Author	SHA1	Message	Date
Sergey M․	dccf4932e1	[extractor/common] Extract timestamp from Last-Modified header	2020-12-07 01:15:30 +07:00
Sergey M․	91dd25fe1e	[extractor/common] Add support for dl8-* media tags (closes #27283)	2020-12-07 01:08:22 +07:00
Sergey M․	06bf2ac20f	[extractor/common] Eliminate media tag name regex duplication	2020-12-07 00:56:29 +07:00
Sergey M․	6ad0d8781e	[extractor/common] Fix media type extraction for HTML5 media tags in start/end form	2020-12-07 00:45:16 +07:00